In [4]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re # regular expressions
import matplotlib.pyplot as plt # drawing and visualizing data
import seaborn as sns # nicer plotting above matplotlib
import nltk # NLP library
from nltk.tokenize import word_tokenize
from tqdm.autonotebook import tqdm

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of each file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Goal for this notebook is to pull out topics from the conversations in movies

In [5]:
# Directory containing the four tab-separated corpus files.
DATA_DIR = "/kaggle/input/movie-dialog-corpus"

# NOTE: `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0.
# `on_bad_lines='warn'` is its documented replacement: rows with too many fields
# are skipped and a warning is emitted for each of them.

# One row per conversation: the two character IDs, the movie ID and the raw
# string holding the line IDs that make up the conversation.
conversations = pd.read_csv(
    f"{DATA_DIR}/movie_conversations.tsv",
    sep='\t',
    encoding='ISO-8859-2',
    names=['charID_1', 'charID_2', 'movieID', 'conversation']
)

# One row per spoken line, indexed by lineID.
lines = pd.read_csv(
    f"{DATA_DIR}/movie_lines.tsv",
    encoding='utf-8-sig',
    sep='\t',
    on_bad_lines='warn',
    header=None,
    names=['lineID', 'charID', 'movieID', 'charName', 'text'],
    index_col=['lineID']
)

# One row per character, indexed by charID.
characters = pd.read_csv(
    f"{DATA_DIR}/movie_characters_metadata.tsv",
    sep='\t',
    header=None,
    on_bad_lines='warn',
    names=['charID', 'charName', 'movieID', 'movieName', 'gender', 'score'],
    index_col=['charID']
)

# One row per movie, indexed by movieID.
titles = pd.read_csv(
    f"{DATA_DIR}/movie_titles_metadata.tsv",
    sep='\t',
    header=None,
    on_bad_lines='warn',
    names=['movieID', 'title', 'year', 'ratingIMDB', 'votes', 'genresIMDB'],
    index_col=['movieID']
)

In [6]:
# Show the conversations table (a bare last expression is rendered as the cell output).
conversations

In [7]:
# Show the lines table, indexed by lineID.
lines

In [8]:
# Show the characters table, indexed by charID.
characters

In [9]:
# Show the movie titles table, indexed by movieID.
titles

# Conversations
We need to convert the conversation column into a proper list of line IDs, which makes these conversations much easier to query.

In [10]:
def _parse_line_ids(raw):
    """Return the list of line IDs held in one `conversation` cell.

    A raw cell is a string from which every run of word characters is taken
    as one ID. A cell that is already a list is passed through, so
    re-running this notebook cell does not raise a TypeError from
    `re.findall` being handed a list.
    """
    if isinstance(raw, str):
        return re.findall(r"\w+", raw)
    return list(raw)


conversations['conversation'] = conversations['conversation'].map(_parse_line_ids)

# A separate column holding the number of lines lets us sort and filter by conversation length.
conversations['length'] = conversations['conversation'].map(len)

# Longest conversations first.
conversations.sort_values(by=['length'], ascending=False)

We can see the longest conversation was an 89 line interaction between two characters, but what about the distribution of interactions between characters?

In [11]:
# Summary statistics for the number of lines per conversation.
length_stats = conversations['length'].describe()
print(length_stats)

# Violin plot of the same column on a wide axis; inner=None draws only the outline.
fig, ax = plt.subplots(figsize=(15, 5))
sns.violinplot(x=conversations['length'], inner=None, ax=ax)

In [12]:
def view_convo(conversation, characters, movies, lines):
    """Print one conversation as a readable transcript.

    Parameters
    ----------
    conversation
        One row of the conversations DataFrame. It is read for `charID_1`,
        `charID_2`, `movieID` and `conversation` (an iterable of line IDs).
    characters
        Frame looked up by character ID through `.loc`; supplies `charName`.
    movies
        Frame looked up by movie ID through `.loc`; supplies `title`.
    lines
        Frame looked up by line ID through `.loc`; supplies `charName` and `text`.

    Prints a header naming both characters and the movie, a dashed rule of
    the same width, then one "speaker : text" row per line ID. Returns None.
    """
    # TODO: should probably change this to allow different arguments
    first_speaker = characters.loc[conversation['charID_1'], 'charName']
    second_speaker = characters.loc[conversation['charID_2'], 'charName']
    movie_title = movies.loc[conversation['movieID'], 'title']

    header = (
        f"This conversation was between {first_speaker} and {second_speaker}, "
        f"from movie : {movie_title}."
    )
    print(header)
    print('-' * len(header))

    for line_id in conversation['conversation']:
        row = lines.loc[line_id]
        print(f"{row['charName']} : {row['text']}")

In [13]:
# Print the first conversation in the table as a quick check of view_convo.
view_convo(conversations.iloc[0], characters, titles, lines)

# Fixing some import issues with movie_lines.tsv
1. Some lines have no text, but that's OK; we will treat them as a mute reply.
2. Some lines were not split on tabs when parsed, so the whole row is stored in the lineID column.
3. To further complicate \#2, some lineIDs contain not just one whole row but 100+ separate lines.

To fix all three (\#3 being an extension of \#2), we will build a dictionary of the malformed entries and loop through it, fixing the lines DataFrame.


In [14]:
# Record the Python type of every text value; the rows of interest are the
# ones selected below as having a float in `text` (a missing text field).
lines["type"] = lines["text"].apply(type)

# Map the real lineID (the first tab-separated field of the index label) to the
# full index label. For a correctly parsed but text-less row the two are equal;
# for a broken row the label holds the whole raw record, or several records
# joined by newlines.
malformed_index_dict = {
    label.split('\t')[0]: label
    for label in lines[lines["type"] == float].index
}

# Rows whose key equals their label parsed correctly and are simply empty:
# take them out of the malformed set and give them a blank text.
empty_lines = [line_id for line_id, label in malformed_index_dict.items() if line_id == label]
for empty_k in empty_lines:
    malformed_index_dict.pop(empty_k, None)

# Assign through a single .loc[rows, column] call. The chained form
# `lines.loc[k]['text'] = ...` may write to a temporary copy of the row and
# leave `lines` untouched. The helper `type` column is not updated for these
# rows because it is never read again and is dropped at the end of this cell.
lines.loc[empty_lines, 'text'] = " "

# Fix the indices first: replace each raw-record label with its real lineID.
lines = lines.rename(index={label: line_id for line_id, label in malformed_index_dict.items()})

#####################################

fixed = []  # repaired rows: [lineID, charID, movieID, charName, text, type]
bad = []    # lineIDs whose raw label has more than five tab-separated fields

# A label with at most five fields is one complete record; anything longer
# holds several records (or a text containing tabs) and is handled below.
for idx, bad_idx in malformed_index_dict.items():
    fields = bad_idx.split('\t')
    if len(fields) > 5:
        bad.append(idx)
    else:
        fixed.append(fields + [str])

# Split the multi-record labels on newlines and repair each record in turn.
for val in bad:
    for line in malformed_index_dict[val].split("\n"):
        if not line.strip():
            # A blank fragment carries no record; skipping it avoids a bogus row.
            continue
        fields = line.split('\t')
        if len(fields) > 5:
            # Extra tabs belong to the text: merge everything after the fourth field.
            fixed.append(fields[:4] + ["".join(fields[4:])] + [str])
        else:
            fixed.append(fields + [str])

# Build a frame from the repaired rows and merge it into `lines`.
df_fixed = pd.DataFrame(fixed, columns=['lineID', 'charID', 'movieID', 'charName', 'text', 'type'])
df_fixed = df_fixed.set_index(['lineID'])

# Repaired rows come first so that, per lineID, first() picks their values
# ahead of the values from the original rows.
# https://stackoverflow.com/questions/63842185/how-to-update-one-pandas-dataframe-with-another-dataframe-update-the-old-data-a
lines = pd.concat([df_fixed, lines])
lines = lines.groupby(lines.index).first().drop(columns=['type'])

# Building a vocabulary

Obligatory word cloud using every single line

In [15]:
# Import the wordcloud library
from wordcloud import WordCloud

# Concatenate the text of every line into one comma-separated string.
long_string = ','.join(lines['text'].tolist())

# Configure the cloud, feed it the combined text, and render it as an image.
wordcloud = WordCloud(
    background_color="white",
    max_words=5000,
    contour_width=3,
    contour_color='steelblue',
)
wordcloud.generate(long_string)
wordcloud.to_image()

Do we care about the following?
1. Getting rid of stop words? --- Probably not? 
2. Stemming our words? --- Probably not?
3. Getting rid of rare words? --- Probably not?
4. Lower casing everything? --- Yes
5. Punctuation type? --- Probably not? Just make all the punctuation "." for now

In [36]:
def process_line(line):
    """Normalize one line of dialogue.

    Each of the characters `,`, `!`, `?`, `;` and `-` is replaced with `.`,
    and the result is lower-cased.

    Parameters
    ----------
    line : str
        Raw text of one line.

    Returns
    -------
    str
        The normalized text. It is not tokenized: the statements that used to
        follow the `return` were unreachable and have been removed.
    """
    unified = re.sub(r'[,!?;-]', '.', line)
    return unified.lower()

# Normalize every line of dialogue (tqdm shows progress), then report how many were processed.
total_words = list(map(process_line, tqdm(lines.text)))
print(len(total_words))

In [17]:
# Keep the normalized text next to the raw text so later cells can read it from the frame.
lines['pText'] = lines['text'].map(process_line)

We now have a normalized version of each line (punctuation unified and lower-cased). We store it in the DataFrame as `pText` for easier access.

In [18]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA

In [19]:
# Bag-of-words vectorizer configured with the built-in English stop-word list.
count_vectorizer = CountVectorizer(stop_words='english')
# Fit the vectorizer on the normalized lines and transform them in one step.
count_data = count_vectorizer.fit_transform(lines['pText'])

In [20]:
# Fit a 5-topic LDA model on the bag-of-words counts.
# random_state fixes the seed so that a fresh "Run All" reproduces the same
# topics; without it every run can yield different ones.
lda = LDA(n_components=5, random_state=42)
lda.fit(count_data)

In [21]:
def print_topics(model, count_vectorizer, n_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    Parameters
    ----------
    model
        Object exposing `components_`, iterated as one weight array per topic.
    count_vectorizer
        Fitted vectorizer giving the vocabulary, in the same column order as
        the weight arrays.
    n_top_words : int
        Number of words to print per topic.

    Prints a "Topic #k:" heading and a space-separated word list per topic.
    Returns None.
    """
    # scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
    # get_feature_names(); prefer the new name and fall back to the old one.
    if hasattr(count_vectorizer, "get_feature_names_out"):
        words = count_vectorizer.get_feature_names_out()
    else:
        words = count_vectorizer.get_feature_names()

    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending, so the reversed tail slice gives the
        # n_top_words largest weights from highest to lowest.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        print("\nTopic #%d:" % topic_idx)
        print(" ".join(words[i] for i in top_indices))

# Print the 10 highest-weighted words for each topic of the fitted LDA model.
print("Topics found via LDA:")
print_topics(lda, count_vectorizer, 10)

In [38]:
# Flatten all processed lines into one list of word tokens.
# Entries that are still plain strings are tokenized first; entries that are
# already token lists are used as they are. The comprehension replaces
# sum(total_words, []), which raises TypeError on string entries and copies
# the growing accumulator on every addition even when given lists.
each_word = [
    token
    for entry in total_words
    for token in (word_tokenize(entry) if isinstance(entry, str) else entry)
]

In [None]:
# Build an NLTK frequency distribution over the items in each_word.
freq = nltk.FreqDist(each_word)