# Data Prep & Processing - Lemmatizing & Tokenizing

#### Set parameters for how output will be saved

In [1]:
data_directory = "../data/"

In [2]:
model_name_suffix = "weighted_nowiki"
run_number = 1

In [3]:
tokenized_run_name = f"{model_name_suffix}_{run_number}_tokens"

#### Imports

In [4]:
# general use
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt

# data preparation for NLP / modeling
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet
# from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# # for LDA modeling & presentation
# from sklearn.model_selection import GridSearchCV
# from sklearn.decomposition import LatentDirichletAllocation
# import pyLDAvis
# import pyLDAvis.sklearn
# pyLDAvis.enable_notebook()

# # for LSA modeling
# from sklearn.decomposition import TruncatedSVD

# for progress bar on loops
from tqdm import tqdm

#### Data Read In

In [5]:
merged_df = pd.read_json(f'{data_directory}merged_df.json')
merged_df.head()

Unnamed: 0,name,href,years,imdb_description,pg_rating,imdb_genre_tags,imdb_rating,num_votes,img_thumbnail,wiki_search_term,...,tmdb_vote_count,first_air_date,tmdb_adult_content,tmdb_poster_path,tmdb_overview,tmdb_tagline,tmdb_genres,tv_networks,tmdb_keywords,is_anime
0,Game of Thrones,/title/tt0944947/,(2011–2019),Nine noble families fight for control over the...,TV-MA,"Action, Adventure, Drama",9.2,2148311,https://m.media-amazon.com/images/M/MV5BYTRiND...,Game of thrones (TV series),...,20934.0,2011-04-17,False,/7WUHnWGx5OO145IRxPDUkQSh4C7.jpg,Seven noble families fight for control of the ...,Winter Is Coming,"Sci-Fi & Fantasy, Drama, Action & Adventure",HBO,"based on novel or book, kingdom, dragon, king,...",False
1,Prison Break,/title/tt0455275/,(2005–2017),"Due to a political conspiracy, an innocent man...",TV-14,"Action, Crime, Drama",8.3,548267,https://m.media-amazon.com/images/M/MV5BMTg3NT...,Prison break (TV series),...,4269.0,2005-08-29,False,/ux7OfhhrXO4FzJtuew18ShiBLq7.jpg,"Due to a political conspiracy, an innocent man...",Break in. Break out. Save your brother's life.,"Action & Adventure, Crime, Drama",FOX,"prison, prisoner, escape, brother, fugitive, c...",False
2,Vikings,/title/tt2306299/,(2013–2020),Vikings transports us to the brutal and myster...,TV-MA,"Action, Adventure, Drama",8.5,547494,https://m.media-amazon.com/images/M/MV5BODk4Zj...,Vikings (2013 TV series),...,6117.0,2013-03-03,False,/bQLrHIRNEkE3PdIWQrZHynQZazu.jpg,"The adventures of Ragnar Lothbrok, the greates...",,"Action & Adventure, Drama, War & Politics","History, Amazon","scandinavia, viking, historical fiction, middl...",False
3,The Boys,/title/tt1190634/,(2019– ),A group of vigilantes set out to take down cor...,TV-MA,"Action, Comedy, Crime",8.7,542317,https://m.media-amazon.com/images/M/MV5BOTEyND...,The Boys (2019 TV series),...,8214.0,2019-07-25,False,/ut4PhX7OP2l2oJhrIUYWnt9pnQU.jpg,A group of vigilantes known informally as The ...,Never meet your heroes.,"Sci-Fi & Fantasy, Action & Adventure",Amazon,"superhero, gore, based on comic, revenge, nudity",False
4,The Mandalorian,/title/tt8111088/,(2019– ),The travels of a lone bounty hunter in the out...,TV-14,"Action, Adventure, Fantasy",8.7,527088,https://m.media-amazon.com/images/M/MV5BZjRlZD...,The Mandalorian (TV series),...,8792.0,2019-11-12,False,/eU1i6eHXlzMOlEq0ku1Rzq7Y4wA.jpg,"After the fall of the Galactic Empire, lawless...",Bounty hunting is a complicated profession.,"Sci-Fi & Fantasy, Action & Adventure, Drama",Disney+,"bounty hunter, space western, space opera, spa...",False


In [6]:
merged_df.isna().sum()

name                  0
href                  0
years                 0
imdb_description      0
pg_rating             0
imdb_genre_tags       0
imdb_rating           0
num_votes             0
img_thumbnail         0
wiki_search_term      0
wiki_search_url       0
wiki_text             0
tmdb_id               0
tmdb_name             0
original_name         0
original_language     0
origin_country        0
tmdb_popularity       0
tmdb_vote_average     0
tmdb_vote_count       0
first_air_date        0
tmdb_adult_content    0
tmdb_poster_path      0
tmdb_overview         0
tmdb_tagline          0
tmdb_genres           0
tv_networks           0
tmdb_keywords         0
is_anime              0
dtype: int64

In [7]:
merged_df.fillna("",inplace=True)

#### Create separate DataFrame with only select columns to be used in lemmatizing/tokenizing

In [8]:
# custom weighting factors

pg_weighting_factor = 10            # =1 is unweighted
imdb_genre_weighting_factor = 10    # =1 is unweighted
tmdb_overview_weighting_factor = 1    # =1 is unweighted
tmdb_tagline_weighting_factor = 1    # =1 is unweighted
tmdb_keywords_weighting_factor = 10    # =1 is unweighted

In [9]:
# create copied DataFrame with only name and a combined column of text
df = merged_df[['name', 'href']].copy()
df['combined_text'] = (merged_df['imdb_description'] + " . " + \
                        (merged_df['pg_rating'].replace("-","") + ", ") * pg_weighting_factor + " . " + \
                        (merged_df['imdb_genre_tags'] + ", ") * imdb_genre_weighting_factor + " . " + \
                        (merged_df['tmdb_overview'] + ", ") * tmdb_overview_weighting_factor + " . " + \
                        (merged_df['tmdb_tagline'] + ", ") * tmdb_tagline_weighting_factor + " . " + \
                        (merged_df['tmdb_keywords'] + ", ") * tmdb_keywords_weighting_factor + " . ").\
                    replace('==', '').replace('\n', '.').replace("\\'s", "'s").replace("\'s", "'s")  #seems like some of these don't work?
df.head()

Unnamed: 0,name,href,combined_text
0,Game of Thrones,/title/tt0944947/,Nine noble families fight for control over the...
1,Prison Break,/title/tt0455275/,"Due to a political conspiracy, an innocent man..."
2,Vikings,/title/tt2306299/,Vikings transports us to the brutal and myster...
3,The Boys,/title/tt1190634/,A group of vigilantes set out to take down cor...
4,The Mandalorian,/title/tt8111088/,The travels of a lone bounty hunter in the out...


## Data prep - Text normalization (lemmatizing, tokenizing, stop words)

#### Lemmatizing

In [10]:
# Lemmatizing Helper Function (lemmatize individual words)
# Reference: General Assembly DSI Lesson 504-lesson-nlp-i

def custom_lemmatize(word, tag):
    
    wn = WordNetLemmatizer()
    
    mapper = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV
    }
    pos = mapper.get(tag[0])
    
    return wn.lemmatize(word, pos) if pos else word

In [11]:
# Lemmatizing 
# Reference: General Assembly DSI Lesson 504-lesson-nlp-i

def lemmatize_func(df_to_process, input_text_col='combined_text', output_col_name='lemmatized_text'):

    df_to_process[output_col_name] = ""

    for i in tqdm(range(df_to_process.shape[0])):
        df_to_process.loc[i, output_col_name] = \
            " ".join([custom_lemmatize(word, tag) for word, tag in nltk.pos_tag( df_to_process[input_text_col][i].split(" "))])
    
    return

In [12]:
lemmatize_func(df, 'combined_text', 'lemmatized_text')

100%|██████████| 2484/2484 [00:25<00:00, 98.18it/s] 


#### Tokenizing & stop word removal

In [13]:
# Tokenizing & stopword removal AFTER lemmatizing
# Reference: General Assembly DSI Lesson 504-lesson-nlp-i

def tokenize_func(df_to_process, input_text_col='lemmatized_text', output_col_name='tokenized_text'):

    tokenizer = RegexpTokenizer('\w+')

    df_to_process[output_col_name] = ""

    for i in tqdm(range(df.shape[0])):
        my_text_tokens = tokenizer.tokenize(df_to_process[input_text_col][i].lower())
        my_text_tokens_ns = [token for token in my_text_tokens if token not in stopwords.words("english")]
        recombined_text = " ".join(my_text_tokens_ns)
        df_to_process.loc[i, output_col_name] = recombined_text
    
    return


In [14]:
tokenize_func(df, 'lemmatized_text', 'tokenized_text')

100%|██████████| 2484/2484 [01:13<00:00, 33.93it/s]


#### Preview & save Dataframe with only tokenized_text

In [15]:
df.head(3)

Unnamed: 0,name,href,combined_text,lemmatized_text,tokenized_text
0,Game of Thrones,/title/tt0944947/,Nine noble families fight for control over the...,Nine noble family fight for control over the l...,nine noble family fight control land westeros ...
1,Prison Break,/title/tt0455275/,"Due to a political conspiracy, an innocent man...","Due to a political conspiracy, an innocent man...",due political conspiracy innocent man send dea...
2,Vikings,/title/tt2306299/,Vikings transports us to the brutal and myster...,Vikings transport us to the brutal and mysteri...,vikings transport us brutal mysterious world r...


In [16]:
columns_to_save = ['name', 'href', 'tokenized_text']

In [17]:
df[columns_to_save].to_csv(f"{data_directory}{tokenized_run_name}.csv", index=False)