# Data Prep & Processing - Lemmatizing & Tokenizing

#### Set parameters for how output will be saved

In [1]:
data_directory = "../data/"

In [2]:
model_name_suffix = "weighted"
run_number = 1

In [3]:
tokenized_run_name = f"{model_name_suffix}_{run_number}_tokens"

#### Imports

In [4]:
# general use
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt

# data preparation for NLP / modeling
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet
# from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# # for LDA modeling & presentation
# from sklearn.model_selection import GridSearchCV
# from sklearn.decomposition import LatentDirichletAllocation
# import pyLDAvis
# import pyLDAvis.sklearn
# pyLDAvis.enable_notebook()

# # for LSA modeling
# from sklearn.decomposition import TruncatedSVD

# for progress bar on loops
from tqdm import tqdm

#### Data Read In

In [16]:
merged_df = pd.read_json(f'{data_directory}merged_df.json')
merged_df.head()

Unnamed: 0,name,href,years,imdb_description,pg_rating,imdb_genre_tags,imdb_rating,num_votes,img_thumbnail,wiki_warning_flag,wiki_search_term,wiki_search_url,wiki_text
0,Game of Thrones,/title/tt0944947/,(2011–2019),Nine noble families fight for control over the...,TV-MA,"Action, Adventure, Drama",9.2,2148311,https://m.media-amazon.com/images/M/MV5BYTRiND...,1,Game of thrones (TV series),https://en.wikipedia.org/wiki/Game_of_thrones_...,Game of Thrones is an American fantasy drama t...
1,Prison Break,/title/tt0455275/,(2005–2017),"Due to a political conspiracy, an innocent man...",TV-14,"Action, Crime, Drama",8.3,548267,https://m.media-amazon.com/images/M/MV5BMTg3NT...,1,Prison break (TV series),https://en.wikipedia.org/wiki/Prison_break_(TV...,Prison Break is an American serial drama telev...
2,Vikings,/title/tt2306299/,(2013–2020),Vikings transports us to the brutal and myster...,TV-MA,"Action, Adventure, Drama",8.5,547494,https://m.media-amazon.com/images/M/MV5BODk4Zj...,0,Vikings (2013 TV series),https://en.wikipedia.org/wiki/Vikings_(2013_TV...,Vikings is a historical drama television serie...
3,The Boys,/title/tt1190634/,(2019– ),A group of vigilantes set out to take down cor...,TV-MA,"Action, Comedy, Crime",8.7,542317,https://m.media-amazon.com/images/M/MV5BOTEyND...,0,The Boys (2019 TV series),https://en.wikipedia.org/wiki/The_Boys_(2019_T...,The Boys is an American superhero television s...
4,The Mandalorian,/title/tt8111088/,(2019– ),The travels of a lone bounty hunter in the out...,TV-14,"Action, Adventure, Fantasy",8.7,527088,https://m.media-amazon.com/images/M/MV5BZjRlZD...,1,The Mandalorian (TV series),https://en.wikipedia.org/wiki/The_Mandalorian_...,The Mandalorian is an American space Western t...


In [17]:
merged_df.isna().sum()

name                 0
href                 0
years                0
imdb_description     0
pg_rating            0
imdb_genre_tags      0
imdb_rating          0
num_votes            0
img_thumbnail        0
wiki_search_term     2
wiki_search_url      2
wiki_text            2
dtype: int64

In [18]:
merged_df.fillna("",inplace=True)

#### Create separate DataFrame with only select columns to be used in lemmatizing/tokenizing

In [19]:
# weighting factors for pg_rating, imdb_genre_tags

pg_weighting_factor = 10            # =1 is unweighted
imdb_genre_weighting_factor = 20    # =1 is unweighted

In [20]:
# create copied DataFrame with only name and a combined column of text
df = merged_df[['name', 'href']].copy()
df['combined_text'] = (merged_df['imdb_description'] + " . " + \
                        (merged_df['pg_rating'].replace("-","") + ", ") * pg_weighting_factor + " . " + \
                        (merged_df['imdb_genre_tags'] + ", ") * imdb_genre_weighting_factor + " . " + \
                        merged_df['wiki_text']).\
                    replace('==', '').replace('\n', '.').replace("\\'s", "'s").replace("\'s", "'s")  #seems like some of these don't work?
df.head()

Unnamed: 0,name,href,combined_text
0,Game of Thrones,/title/tt0944947/,Nine noble families fight for control over the...
1,Prison Break,/title/tt0455275/,"Due to a political conspiracy, an innocent man..."
2,Vikings,/title/tt2306299/,Vikings transports us to the brutal and myster...
3,The Boys,/title/tt1190634/,A group of vigilantes set out to take down cor...
4,The Mandalorian,/title/tt8111088/,The travels of a lone bounty hunter in the out...


## Data prep - Text normalization (lemmatizing, tokenizing, stop words)

#### Lemmatizing

In [24]:
# Lemmatizing Helper Function (lemmatize individual words)
# Reference: General Assembly DSI Lesson 504-lesson-nlp-i

def custom_lemmatize(word, tag):
    mapper = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV
    }
    pos = mapper.get(tag[0])
    
    return wn.lemmatize(word, pos) if pos else word

In [25]:
# Lemmatizing 
# Reference: General Assembly DSI Lesson 504-lesson-nlp-i

def lemmatize_func(df_to_process, input_text_col='combined_text', output_col_name='lemmatized_text'):

    wn = WordNetLemmatizer()

    df_to_process[output_col_name] = ""

    for i in tqdm(range(df_to_process.shape[0])):
        df_to_process.loc[i, output_col_name] = \
            " ".join([custom_lemmatize(word, tag) for word, tag in nltk.pos_tag( df_to_process[input_text_col][i].split(" "))])
    
    return

In [26]:
lemmatize_func(df, 'combined_text', 'lemmatized_text')

100%|██████████| 2484/2484 [04:01<00:00, 10.27it/s]


#### Tokenizing & stop word removal

In [27]:
# Tokenizing & stopword removal AFTER lemmatizing
# Reference: General Assembly DSI Lesson 504-lesson-nlp-i

def tokenize_func(df_to_process, input_text_col='lemmatized_text', output_col_name='tokenized_text'):

    tokenizer = RegexpTokenizer('\w+')

    df_to_process[output_col_name] = ""

    for i in tqdm(range(df.shape[0])):
        my_text_tokens = tokenizer.tokenize(df_to_process[input_text_col][i].lower())
        my_text_tokens_ns = [token for token in my_text_tokens if token not in stopwords.words("english")]
        recombined_text = " ".join(my_text_tokens_ns)
        df_to_process.loc[i, output_col_name] = recombined_text
    
    return


In [28]:
tokenize_func(df, 'lemmatized_text', 'tokenized_text')

100%|██████████| 2484/2484 [17:43<00:00,  2.34it/s] 


#### Preview & save Dataframe with only tokenized_text

In [29]:
df.head(3)

Unnamed: 0,name,href,combined_text,lemmatized_text,tokenized_text
0,Game of Thrones,/title/tt0944947/,Nine noble families fight for control over the...,Nine noble family fight for control over the l...,nine noble family fight control land westeros ...
1,Prison Break,/title/tt0455275/,"Due to a political conspiracy, an innocent man...","Due to a political conspiracy, an innocent man...",due political conspiracy innocent man send dea...
2,Vikings,/title/tt2306299/,Vikings transports us to the brutal and myster...,Vikings transport us to the brutal and mysteri...,vikings transport us brutal mysterious world r...


In [30]:
columns_to_save = ['name', 'href', 'tokenized_text']

In [31]:
df[columns_to_save].to_csv(f"{data_directory}{tokenized_run_name}.csv", index=False)