In this project, we work with subtitle text from TV series. From the given text we want to:
- get a list of word stems ordered by frequency.
- show example sentences for each word, following the order of the above stem list.
The results are saved in a CSV file and a TXT file, respectively.

In [1]:
import pysrt
from string import punctuation
import re
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from nltk.stem.snowball import SnowballStemmer

In [2]:
# use pysrt to read subtitle files in srt format.
def get_subs(path):
    """Open the subtitle file at ``path`` with pysrt and return the result of ``pysrt.open``."""
    return pysrt.open(path)

In [3]:
# for each word in the subtitle, save its location (index of sentence in the subtitle).
def get_word_loc_dic(subs,num):
    """Map every word in ``subs`` to the list of places where it occurs.

    Each entry's ``.text`` is lower-cased, every punctuation mark except the
    apostrophe is replaced by a space, runs of digits are deleted, and the
    remainder is split on whitespace.

    Parameters:
        subs: indexable sequence of entries exposing a ``.text`` string.
        num: tag stored alongside each position (the caller passes the
            episode number).

    Returns:
        dict mapping word -> list of ``(num, index)`` tuples, where ``index``
        is the position of the entry inside ``subs``. A word that appears
        several times in one entry contributes one tuple per appearance.
    """
    # Apostrophes are kept out of the character class so contractions stay in one piece.
    punct_class = r"[{}]".format(punctuation.replace("'", ""))
    word_loc_dic = {}
    for index, entry in enumerate(subs):
        cleaned = re.sub(punct_class, ' ', entry.text.lower())
        cleaned = re.sub('[0-9]+', '', cleaned)
        for word in cleaned.split():
            word_loc_dic.setdefault(word, []).append((num, index))
    return word_loc_dic

In [4]:
# if there are several subtitles and we save words and their locations in several dictionaries, we want to combine
# those dictionaries
def combine_dict(dict_list):
    """Merge several word -> locations dictionaries into a single new dictionary.

    Parameters:
        dict_list: sequence of dicts whose values are lists of locations.

    Returns:
        A new dict in which each word maps to the concatenation of its
        location lists, in the order the dictionaries appear in ``dict_list``.
        An empty ``dict_list`` yields an empty dict.

    Neither the input dictionaries nor their lists are modified or shared with
    the result, so the per-file dictionaries stay usable after merging.
    """
    combined_dict = {}
    for dic in dict_list:
        for word, value in dic.items():
            # extend() copies the items into a list owned by the result.
            combined_dict.setdefault(word, []).extend(value)
    return combined_dict

In [5]:
# use SnowballStemmer to get stems and corresponding words.
def get_stem_words_dic(dic):
    """Group the keys of ``dic`` by their English Snowball stem.

    Parameters:
        dic: dict (or any iterable of words); only its keys are read.

    Returns:
        dict mapping stem -> list of the words that produced that stem, in
        the order the words were encountered.
    """
    english_stemmer = SnowballStemmer("english")
    stem_words_dic = {}
    for word in dic:
        stem_words_dic.setdefault(english_stemmer.stem(word), []).append(word)
    return stem_words_dic

In [6]:
# we want to generate a dataframe with columns 'stem', 'words' and 'stem_freq'. In order to do that, we extract 
# necessary data from the two dictionaries (stem_words_dic, word_loc_dic) we get from previous steps. 
def get_stem_words_freq_list(stem_words_dic, word_loc_dic):
    """Build one record per stem with its words and total occurrence count.

    Parameters:
        stem_words_dic: dict mapping stem -> list of words.
        word_loc_dic: dict mapping word -> list of locations; the length of
            that list is taken as the word's frequency.

    Returns:
        list of dicts with keys ``"stem"``, ``"words"`` and ``"stem_freq"``,
        where ``stem_freq`` is the sum of the frequencies of the stem's words.

    Raises:
        KeyError: if a word listed under a stem is missing from ``word_loc_dic``.
    """
    return [
        {
            "stem": stem,
            "words": words,
            "stem_freq": sum(len(word_loc_dic[word]) for word in words),
        }
        for stem, words in stem_words_dic.items()
    ]

In [7]:
# generate the dataframe and sort it according to 'stem_freq'.
def get_stem_words_freq_df(stem_words_freq_list):
    """Turn the per-stem records into a dataframe ranked by frequency.

    Parameters:
        stem_words_freq_list: list of dicts with keys ``stem``, ``words``
            and ``stem_freq``.

    Returns:
        DataFrame with columns ``stem``, ``words``, ``stem_freq`` and
        ``stem_freq_pct`` (each stem's share of the summed ``stem_freq``),
        sorted by ``stem_freq`` then ``stem``, both descending, with a fresh
        0-based index.
    """
    frame = pd.DataFrame(stem_words_freq_list, columns = ['stem', 'words','stem_freq'])
    total = frame['stem_freq'].sum()
    frame = frame.assign(stem_freq_pct = frame['stem_freq'] / total)
    ranked = frame.sort_values(by = ['stem_freq', 'stem'], ascending = False)
    return ranked.reset_index(drop = True)

In [8]:
# read subtitle files
# Raw strings keep the backslash in the path literal: '\S' is not a valid escape
# sequence, and newer Python versions warn about relying on it.
subs_1 = get_subs(r'Subtitles\S1E1.srt')
subs_2 = get_subs(r'Subtitles\S1E2.srt')
subs_3 = get_subs(r'Subtitles\S1E3.srt')

In [9]:
# get word_location dictionaries and combine them into one.
# The second argument is the episode number stored with every location.
word_loc_1 = get_word_loc_dic(subs_1,1)
word_loc_2 = get_word_loc_dic(subs_2,2)
word_loc_3 = get_word_loc_dic(subs_3,3)
word_loc_list = [word_loc_1,word_loc_2, word_loc_3]
word_loc_combined = combine_dict(word_loc_list)
# display first 5 items of word_loc_combined dict.
# items() is a non-subscriptable view on Python 3, so turn it into a list before slicing.
dict(list(word_loc_combined.items())[0:5])

{u'everybody': [(3, 392), (3, 466)],
 u'four': [(1, 133), (2, 225), (2, 416), (3, 100)],
 u'hate': [(1, 268), (1, 269), (3, 9), (3, 10), (3, 183)],
 u'personally': [(2, 452)],
 u'sleep': [(3, 44), (3, 45), (3, 49)]}

In [10]:
# get stem_words dictionary
stem_words_dic = get_stem_words_dic(word_loc_combined)
# display first 5 items of stem_word dict.
# items() is a non-subscriptable view on Python 3, so turn it into a list before slicing.
dict(list(stem_words_dic.items())[0:5])

{u'four': [u'four'],
 u'hate': [u'hate'],
 u'skeleton': [u'skeleton'],
 u'sleep': [u'sleep', u'sleeping'],
 u'sorri': [u'sorry']}

In [11]:
# Build the per-stem records, then the frequency-ranked dataframe.
stem_words_freq_list = get_stem_words_freq_list(
    stem_words_dic=stem_words_dic,
    word_loc_dic=word_loc_combined,
)
stem_words_freq_df = get_stem_words_freq_df(stem_words_freq_list=stem_words_freq_list)
# Show the first 5 rows of stem_words_freq_df.
stem_words_freq_df.head(n=5)

Unnamed: 0,stem,words,stem_freq,stem_freq_pct
0,i,[i],456,0.034916
1,you,[you],400,0.030628
2,to,[to],343,0.026263
3,the,[the],324,0.024809
4,it,"[it, its, it's]",266,0.020368


In [12]:
# since we want to summarize frequency of stems and we observe that there are contraction forms in the 'stem' column
# such as i'm and don't, we are going to split such forms and merge them to original stems.
def combine_lines_df(df, stem1, stem2):
    """Return a copy of ``df`` in which the row for ``stem2`` is folded into ``stem1``.

    The ``stem_freq`` and ``stem_freq_pct`` of the ``stem2`` row are added to
    the ``stem1`` row, and the ``stem2`` words are appended to the ``stem1``
    word list. The ``stem2`` row itself is left in place, and ``df`` is not
    modified.

    Parameters:
        df: dataframe with columns ``stem``, ``words``, ``stem_freq`` and
            ``stem_freq_pct``.
        stem1: stem of the row that receives the counts and words.
        stem2: stem of the row whose counts and words are added.

    Raises:
        IndexError: if either stem has no row in ``df``.
    """
    def _first_row_label(stem):
        # Label of the first row whose stem matches; empty match -> IndexError.
        return df.index[df['stem'] == stem].values.astype(int)[0]

    target = _first_row_label(stem1)
    source = _first_row_label(stem2)

    merged = df.copy()
    for column in ('stem_freq', 'stem_freq_pct'):
        merged.loc[target, column] = merged.loc[target, column] + merged.loc[source, column]
    # Concatenation builds a new list, so the list objects held by `df` stay untouched.
    merged.at[target, 'words'] = merged.loc[target, 'words'] + merged.loc[source, 'words']
    return merged

In [13]:
# deal with contractions and then drop them and reorder the dataframe
def combine_contraction_df(df_before, order = True):
    """Fold contraction stems into the stems of the words they stand for.

    For every contraction in the table below, its frequency, percentage and
    word list are added (via ``combine_lines_df``) to each of its component
    stems, e.g. ``i'm`` is added to both ``i`` and ``am``. Afterwards the
    contraction rows themselves are removed.

    Parameters:
        df_before: dataframe with columns ``stem``, ``words``, ``stem_freq``
            and ``stem_freq_pct``; it is not modified.
        order: when true, re-sort by ``stem_freq`` then ``stem`` (both
            descending) and reset the index.

    Returns:
        A new dataframe without the contraction rows.

    Notes:
        * Only the argument is consulted; the rows to drop are exactly the
          contractions listed in the table, not a fixed-size slice of a
          module-level dataframe. Apostrophe stems that are not in the table
          (such as ``ma'am``) are therefore kept.
        * A contraction that does not occur in ``df_before`` is skipped, and
          a component stem that does not occur is skipped as a merge target,
          so the function also works on subtitle sets that lack some of them.
    """
    # (contraction stem, stems it expands to), in the order the merges are applied.
    # The merge order determines the order of entries in each 'words' list.
    contraction_parts = (
        ("i'm", ('i', 'am')),
        ("don't", ('do', 'not')),
        ("you'r", ('you', 'are')),
        ("can't", ('can', 'not')),
        ("i'll", ('i', 'will')),
        ("didn't", ('did', 'not')),
        ("we'r", ('we', 'are')),
        ("i'v", ('i', 'have')),
        ("doesn't", ('do', 'not')),
        ("you'v", ('you', 'have')),
        ("i'd", ('i', 'had')),
        ("won't", ('will', 'not')),
        ("they'r", ('they', 'are')),
        ("isn't", ('is', 'not')),
        ("couldn't", ('could', 'not')),
        ("we'll", ('we', 'will')),
        ("wasn't", ('was', 'not')),
        ("you'll", ('you', 'will')),
        ("wouldn't", ('would', 'not')),
        ("you'd", ('you', 'had')),
        ("haven't", ('have', 'not')),
        ("shouldn't", ('should', 'not')),
        ("aren't", ('are', 'not')),
        ("we'v", ('we', 'have')),
        ("he'll", ('he', 'will')),
        ("she'd", ('she', 'had')),
        ("we'd", ('we', 'had')),
        ("they'll", ('they', 'will')),
        ("it'll", ('it', 'will')),
        ("she'll", ('she', 'will')),
        ("he'd", ('he', 'had')),
        ("they'v", ('they', 'have')),
        ("weren't", ('were', 'not')),
        ("would'v", ('would', 'have')),
        ("hasn't", ('has', 'not')),
        ("could'v", ('could', 'have')),
        ("hadn't", ('had', 'not')),
        ("should'v", ('should', 'have')),
        ("must'v", ('must', 'have')),
        # NOTE(review): "i'ii" looks like a mis-typed "i'll" in the subtitle text - confirm.
        ("i'ii", ('i', 'will')),
        ("they'd", ('they', 'had')),
        ("who'd", ('who', 'had')),
        ("this'll", ('this', 'will')),
        ("that'll", ('that', 'will')),
    )

    df = df_before.copy()
    # Merging never adds or removes rows, so the set of stems can be computed once.
    present = set(df['stem'])
    for contraction, parts in contraction_parts:
        if contraction not in present:
            continue
        for part in parts:
            if part in present:
                df = combine_lines_df(df, part, contraction)

    # Drop exactly the contractions that were (or would have been) merged.
    merged_stems = [contraction for contraction, _ in contraction_parts]
    df = df[~df['stem'].isin(merged_stems)]
    if order:
        df = df.sort_values(by = ['stem_freq', 'stem'], ascending = False).reset_index(drop = True)
    return df

In [14]:
# Merge the contraction rows into their base stems to get the cleaned dataframe.
stem_words_freq_df_clean = combine_contraction_df(df_before=stem_words_freq_df)
# Show the first 5 rows of the cleaned stem_words_freq dataframe.
stem_words_freq_df_clean.head(n=5)

Unnamed: 0,stem,words,stem_freq,stem_freq_pct
0,i,"[i, i'm, i'll, i've, i'd, i'ii]",634,0.048545
1,you,"[you, you're, you've, you'll, you'd]",466,0.035681
2,to,[to],343,0.026263
3,the,[the],324,0.024809
4,not,"[not, don't, can't, didn't, doesn't, won't, is...",271,0.02075


In [15]:
# save dataframe with columns 'stem','words' and 'stem_freq' to csv file. For better display in csv file, 
# we remove unicode character "u" and brackets "[]".
def remove_unicode_brackes(item):
    """Render the elements of ``item`` as a quoted, comma-separated string.

    Each element is converted with ``str`` and shown in its quoted form,
    joined by ``", "`` - i.e. the text of a list of plain strings without the
    surrounding square brackets. An empty ``item`` gives an empty string.
    """
    return ", ".join(repr(str(entry)) for entry in item)
def save_csv(df, file_name):
    """Write ``df`` to ``file_name`` as CSV with the ``words`` lists flattened to text.

    The ``words`` column is passed through ``remove_unicode_brackes`` on a
    derived frame, so ``df`` itself is left unchanged.
    """
    flattened_words = df['words'].apply(remove_unicode_brackes)
    df.assign(words = flattened_words).to_csv(file_name)

In [16]:
# for each stem we want to show example sentences for each word related to the stem.
# Subtitle objects ordered so that episode number n is found at position n-1.
subs_all = [subs_1, subs_2, subs_3]
def word_write_sentences(word, num, location = True):
    """Write up to ``num`` example subtitle entries containing ``word``.

    Locations come from the module-level ``word_loc_combined``; the text is
    looked up in ``subs_all`` and written to the module-level ``file`` handle,
    which must be open for writing.

    Parameters:
        word: key into ``word_loc_combined``.
        num: maximum number of occurrences to write.
        location: when true, the entry's start and end times are written too.
    """
    # max(..., 0) keeps a negative `num` meaning "write nothing".
    for epi, loc in word_loc_combined[word][:max(num, 0)]:
        episode_subs = subs_all[epi - 1]
        line = episode_subs[loc].text
        start = episode_subs[loc].start.to_time()
        end = episode_subs[loc].end.to_time()
        file.write('Episode ' + str(epi) + "\n")
        if location:
            for moment in (start, end):
                file.write(str(moment)+ "\n")
        file.write(line)
        file.write("\n")
def stem_write_words_sentences(df,stem, num, location = True):
    """Write one stem's header followed by example sentences for each of its words.

    Output goes to the module-level ``file`` handle.

    Parameters:
        df: dataframe with ``stem`` and ``words`` columns; exactly one row
            must match ``stem``.
        stem: the stem to report.
        num: maximum number of example sentences per word.
        location: forwarded to ``word_write_sentences``.
    """
    words = df.loc[df['stem'] == stem, 'words'].item()
    header = (
        "Stem: " + stem + "\n",
        "Words: " + str(words)+ "\n",
        "Example sentences: "+ "\n",
    )
    for chunk in header:
        file.write(chunk)
    for word in words:
        file.write("-" + word + ":"+ "\n")
        word_write_sentences(word, num, location)
    file.write("\n")
def freq_write_stems(df, sentence_num, stem_num, page_num, location = True):
    """Write one "page" of stems, with their words and example sentences.

    Rows are taken by position, so ``df`` is expected to be already sorted in
    the order the stems should be reported. Output goes to the module-level
    ``file`` handle.

    Parameters:
        df: dataframe with columns ``stem``, ``words``, ``stem_freq`` and
            ``stem_freq_pct``.
        sentence_num: maximum number of example sentences per word.
        stem_num: number of stems per page.
        page_num: 1-based page to write; a page that lies outside the
            dataframe writes nothing.
        location: forwarded to ``stem_write_words_sentences``.
    """
    length = df.shape[0]
    first = (page_num-1)*stem_num
    # Positional access would wrap around for a negative start, so reject it explicitly.
    if first < 0 or first > length:
        return
    for i in range(first, min(page_num*stem_num,length)):
        file.write(str(i+1))
        file.write("\n")
        # `i` is a row position, not an index label, hence iloc rather than loc.
        row = df.iloc[i]
        stem = row['stem']
        stem_freq = row['stem_freq']
        stem_freq_pct = row['stem_freq_pct']
        file.write(str(stem_freq) + '\n')
        file.write(str(stem_freq_pct) + '\n')
        stem_write_words_sentences(df,stem, sentence_num, location)
        file.write("----------------------------------------------------"+ "\n")

In [17]:
# Keep the 20 most frequent stems (the dataframe is already sorted by frequency).
stem_1_to_20 = stem_words_freq_df_clean.head(20).copy()
# Export that selection to a csv file.
save_csv(df=stem_1_to_20, file_name='result_first_20stems.csv')

In [18]:
# for the first 20 stems, print corresponding words and several sentences
import sys
# Python 2 only: make utf8 the implicit codec so unicode subtitle text can be written
# to the file. reload() is not a builtin on Python 3, so the hack is skipped there.
if sys.version_info[0] < 3:
    reload(sys)
    sys.setdefaultencoding('utf8')
# The writer functions look up the module-level name `file`, so the handle is bound to it.
# The with-block closes the handle even if writing raises part-way through.
with open("result_first_20stems_with_examples.txt","w") as file:
    freq_write_stems(stem_1_to_20, 2, 20, 1, 1)