In [1]:
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split

# Let long text cells (e.g. plot summaries) show up to 120 characters before pandas truncates them.
pd.set_option('display.max_colwidth', 120)

In [2]:
# Read the training and test splits from CSV files in the working directory.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [3]:
# Column dtypes and non-null counts for the training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13164 entries, 0 to 13163
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Release Year      13164 non-null  int64 
 1   Title             13164 non-null  object
 2   Origin/Ethnicity  13164 non-null  object
 3   Director          13164 non-null  object
 4   Cast              12980 non-null  object
 5   Genre             13164 non-null  object
 6   Wiki Page         13164 non-null  object
 7   Plot              13164 non-null  object
dtypes: int64(1), object(7)
memory usage: 822.9+ KB


In [4]:
# Summary statistics for the numeric columns of the training frame.
train_df.describe()

Unnamed: 0,Release Year
count,13164.0
mean,1977.190064
std,28.418616
min,1903.0
25%,1953.0
50%,1983.0
75%,2004.0
max,2017.0


In [5]:
# Number of training rows per genre label (the target the models below predict).
train_df['Genre'].value_counts()

drama        4781
comedy       3503
horror        937
action        890
thriller      768
romance       743
western       678
crime         442
adventure     422
Name: Genre, dtype: int64

In [6]:
# Peek at ten randomly chosen training rows. The fixed seed keeps the displayed
# sample identical on every Restart & Run All.
train_df.sample(10, random_state=42)

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
6257,1978,The Stud,British,Quentin Masters,"Joan Collins, Oliver Tobias",drama,https://en.wikipedia.org/wiki/The_Stud_(film),"Fontaine Khaled is the London wife of a wealthy but boring businessman. She spends his money on her nightclub, Hobo,..."
7903,1945,Bring on the Girls,American,Sidney Lanfield,"Veronica Lake, Marjorie Reynolds",comedy,https://en.wikipedia.org/wiki/Bring_on_the_Girls_(film),Wealthy J. Newport Bates breaks off an engagement after discovering his fiancee is a gold digger. He joins the Navy ...
12880,2013,Mahapurush O Kapurush,Bengali,Aniket Chattopadhyay,"Bratya Basu, Dipankar De, Locket Chatterjee, Bhola Tamang, Tanima Sen, Ritwick Chakraborty, Biswanath Basu, Sujoy Pr...",comedy,https://en.wikipedia.org/wiki/Mahapurush_O_Kapurush,"Bireshwar Chatterjee (Dipankar De), a wealthy industrialist, is a very happy man. The reason for his happiness is th..."
3756,1944,The Big Noise,American,Malcolm St. Clair,"Laurel and Hardy, Doris Merrick",comedy,https://en.wikipedia.org/wiki/The_Big_Noise_(1944_film),"While cleaning the office of a detective agency, janitors Laurel and Hardy answer a telephone call from an inventor ..."
12794,1960,The Savage Innocents,American,Nicholas Ray,"Anthony Quinn, Peter O'Toole, Yoko Tani",drama,https://en.wikipedia.org/wiki/The_Savage_Innocents,"Inuk, an Inuk, kills a priest who rejects his traditional offer of food and his wife's company. Pursued by white pol..."
9679,2014,Bhakarkhadi 7 km,Marathi,Umesh Namjoshi,"Aniket Vishwasrao, Veena Jamakar, Apurva Nemalekar",drama,https://en.wikipedia.org/wiki/Bhakarkhadi_7_km,Bhakharkhadi 7 km tells the story of a young doctor who aspires to a career as a surgeon in America. His dreams are ...
1325,1956,The Sharkfighters,American,Jerry Hopper,"Victor Mature, Karen Steele",adventure,https://en.wikipedia.org/wiki/The_Sharkfighters,"In August 1943, Lt. Commander Ben Staves (Mature), recovering from the sinking of his destroyer in battle and the lo..."
2080,1943,The Crystal Ball,American,Elliott Nugent,"Paulette Goddard, Ray Milland",comedy,https://en.wikipedia.org/wiki/The_Crystal_Ball_(film),"A maid, in cahoots with Madame Zenobia (Gladys George), a fake psychic, fools Jo Ainsly (Virginia Field) into believ..."
3325,1968,All Neat in Black Stockings,British,Christopher Morahan,"Victor Henry, Susan George",comedy,https://en.wikipedia.org/wiki/All_Neat_in_Black_Stockings,"Ginger (Victor Henry) is a window washer with an eye for the girls. His best friend and neighbor, Dwyer, (Jack Sheph..."
4920,2014,Devil's Due,American,Matt Bettinelli-Olpin Tyler Gillett,Zach Gilford\r\nAllison Miller\r\nSam Anderson\r\nAimee Carrero,horror,https://en.wikipedia.org/wiki/Devil%27s_Due_(film),"A young couple, Zach and Samantha McCall, are about to get married when Zach decides he wants to document their life..."


# Basic model

In [7]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [8]:
def _count_lr_pipeline(multi_class):
    """Return a bag-of-words -> logistic-regression pipeline.

    `multi_class` is forwarded unchanged to `LogisticRegression`; the step
    names ('vectorizer', 'classifier') are what the grid keys below refer to.
    """
    return Pipeline(steps=[
        ('vectorizer', CountVectorizer()),
        ('classifier', LogisticRegression(max_iter=3000, multi_class=multi_class)),
    ])


# Same features, two ways of handling the multi-class target.
pipeline_1 = _count_lr_pipeline('ovr')
pipeline_2 = _count_lr_pipeline('multinomial')

# Vectorizer settings explored by the grid search (2 * 3 * 2 = 12 candidates).
parameters = dict(
    vectorizer__max_df=[.6, .8],
    vectorizer__min_df=[.01, .02, 0.05],
    vectorizer__ngram_range=[(1, 1), (1, 2)],
)

In [9]:
# Both searches share the same settings: 5-fold CV, all available cores, progress logging.
_search_kwargs = dict(n_jobs=-1, cv=5, verbose=1)

grid_search_1 = GridSearchCV(pipeline_1, parameters, **_search_kwargs)
grid_search_2 = GridSearchCV(pipeline_2, parameters, **_search_kwargs)

In [10]:
# Run the grid search for the one-vs-rest pipeline on the raw plot text.
grid_search_1.fit(train_df['Plot'],train_df['Genre'])

Fitting 5 folds for each of 12 candidates, totalling 60 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vectorizer', CountVectorizer()),
                                       ('classifier',
                                        LogisticRegression(max_iter=3000,
                                                           multi_class='ovr'))]),
             n_jobs=-1,
             param_grid={'vectorizer__max_df': [0.6, 0.8],
                         'vectorizer__min_df': [0.01, 0.02, 0.05],
                         'vectorizer__ngram_range': [(1, 1), (1, 2)]},
             verbose=1)

In [11]:
# Run the grid search for the multinomial pipeline on the raw plot text.
grid_search_2.fit(train_df['Plot'],train_df['Genre'])

Fitting 5 folds for each of 12 candidates, totalling 60 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vectorizer', CountVectorizer()),
                                       ('classifier',
                                        LogisticRegression(max_iter=3000,
                                                           multi_class='multinomial'))]),
             n_jobs=-1,
             param_grid={'vectorizer__max_df': [0.6, 0.8],
                         'vectorizer__min_df': [0.01, 0.02, 0.05],
                         'vectorizer__ngram_range': [(1, 1), (1, 2)]},
             verbose=1)

In [12]:
# Show the best pipeline found by the one-vs-rest search.
grid_search_1.best_estimator_

Pipeline(steps=[('vectorizer', CountVectorizer(max_df=0.8, min_df=0.01)),
                ('classifier',
                 LogisticRegression(max_iter=3000, multi_class='ovr'))])

In [13]:
# Show the best pipeline found by the multinomial search.
grid_search_2.best_estimator_

Pipeline(steps=[('vectorizer',
                 CountVectorizer(max_df=0.8, min_df=0.01, ngram_range=(1, 2))),
                ('classifier',
                 LogisticRegression(max_iter=3000, multi_class='multinomial'))])

In [17]:
# Predict test-set genres with the winning pipeline from each grid search
best_pipe_1 = grid_search_1.best_estimator_
best_pipe_2 = grid_search_2.best_estimator_

lr_prediction_1 = best_pipe_1.predict(test_df['Plot'])
lr_prediction_2 = best_pipe_2.predict(test_df['Plot'])

# Submission frames: 'Predicted' first, then the test 'Id' aligned on the row index
lr_1_predictions = pd.DataFrame({'Predicted': lr_prediction_1}).assign(Id=test_df['Id'])
lr_2_predictions = pd.DataFrame({'Predicted': lr_prediction_2}).assign(Id=test_df['Id'])

# Write one CSV per model, without the row index
lr_1_predictions.to_csv("lr_1_predictions.csv", index=False)
lr_2_predictions.to_csv("lr_2_predictions.csv", index=False)

# Improved model utilizing stop words, lemmatization and TfidfVectorizer

In [18]:
# `import spacy.cli` also binds the top-level `spacy` name, which `spacy.load` is called on below.
import spacy.cli
# Load the `en_core_web_md` pipeline; it is used to lemmatize the plot text in the following cells.
nlp = spacy.load("en_core_web_md")

In [19]:
# https://stackoverflow.com/questions/62712963/using-spacy-to-lemmatize-a-column-of-parsed-html-text-in-a-pandas-dataframe
# Lemmatize every training plot into a space-joined string of token lemmas.
# `nlp.pipe` streams the texts through spaCy in batches, which is considerably
# faster than calling `nlp(text)` once per row on a corpus this size.
train_df['lemmatized'] = [
    " ".join(token.lemma_ for token in doc)
    for doc in nlp.pipe(train_df['Plot'])
]

In [20]:
# Define function to lemmatize to apply on other DFs
def lemmatize_plot(df: pd.DataFrame) -> pd.DataFrame:
    """Add a 'lemmatized' column holding the space-joined lemmas of each 'Plot'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Plot' column of strings.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; the column is added in place
        (an existing 'lemmatized' column is overwritten).

    Notes
    -----
    Uses the module-level spaCy pipeline `nlp`. Texts are streamed through
    `nlp.pipe`, which batches documents and is much faster than one
    `nlp(text)` call per row.
    """
    # Positional list assignment keeps each result on its source row whatever the index is.
    df['lemmatized'] = [
        " ".join(token.lemma_ for token in doc)
        for doc in nlp.pipe(df['Plot'])
    ]
    return df

In [21]:
# Small hand-picked stop-word list passed to the vectorizers in later cells.
# NOTE(review): the vectorized text is lemmatized first, so inflected entries such as 'has'
# may never match a token — confirm against the lemmatizer's output.
stop_words_list = ['a','now', 'in', 'on', 'to', 'has', 'about',
                   'for', 'that', 'by', 'from', 'an' , 'or', 'as']
# Add the 'lemmatized' column to the test frame as well, so it can be fed to the same pipelines.
test_df = lemmatize_plot(test_df)

In [22]:
# Second round: TF-IDF features with the custom stop-word list, to be fitted on the lemmatized plots.
# One pipeline per multi-class strategy; only the classifier's `multi_class` differs.
pipeline_1_lemma, pipeline_2_lemma = (
    Pipeline(steps=[
        ('vectorizer', TfidfVectorizer(stop_words=stop_words_list)),
        ('classifier', LogisticRegression(max_iter=3000, multi_class=strategy)),
    ])
    for strategy in ('ovr', 'multinomial')
)

# Vectorizer grid for the search (2 * 3 * 2 = 12 candidates).
parameters = dict(
    vectorizer__max_df=[.6, .8],
    vectorizer__min_df=[.01, .02, 0.05],
    vectorizer__ngram_range=[(1, 1), (1, 2)],
)

In [23]:
# 5-fold grid searches over the TF-IDF pipelines, using all available cores.
grid_search_1_lemma = GridSearchCV(
    estimator=pipeline_1_lemma, param_grid=parameters, n_jobs=-1, cv=5, verbose=1
)
grid_search_2_lemma = GridSearchCV(
    estimator=pipeline_2_lemma, param_grid=parameters, n_jobs=-1, cv=5, verbose=1
)

In [24]:
# Run the one-vs-rest TF-IDF search on the lemmatized plot text.
grid_search_1_lemma.fit(train_df['lemmatized'],train_df['Genre'])

Fitting 5 folds for each of 12 candidates, totalling 60 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vectorizer',
                                        TfidfVectorizer(stop_words=['a', 'now',
                                                                    'in', 'on',
                                                                    'to', 'has',
                                                                    'about',
                                                                    'for',
                                                                    'that',
                                                                    'by',
                                                                    'from',
                                                                    'an', 'or',
                                                                    'as'])),
                                       ('classifier',
                                        LogisticRegression(max_iter=3000,
                       

In [25]:
# Run the multinomial TF-IDF search on the lemmatized plot text.
grid_search_2_lemma.fit(train_df['lemmatized'],train_df['Genre'])

Fitting 5 folds for each of 12 candidates, totalling 60 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vectorizer',
                                        TfidfVectorizer(stop_words=['a', 'now',
                                                                    'in', 'on',
                                                                    'to', 'has',
                                                                    'about',
                                                                    'for',
                                                                    'that',
                                                                    'by',
                                                                    'from',
                                                                    'an', 'or',
                                                                    'as'])),
                                       ('classifier',
                                        LogisticRegression(max_iter=3000,
                       

In [26]:
# Keep the best pipeline from each lemmatized-text grid search.
best_pipe_1_lemma, best_pipe_2_lemma = (
    grid_search_1_lemma.best_estimator_,
    grid_search_2_lemma.best_estimator_,
)

In [27]:
# Predict test-set genres from the lemmatized plots
lemmatized_test_text = test_df['lemmatized']
lr_prediction_1_lemma = best_pipe_1_lemma.predict(lemmatized_test_text)
lr_prediction_2_lemma = best_pipe_2_lemma.predict(lemmatized_test_text)

# Submission frames: 'Predicted' first, then the test 'Id' aligned on the row index
lr_1_predictions_lemma = pd.DataFrame({'Predicted': lr_prediction_1_lemma}).assign(Id=test_df['Id'])
lr_2_predictions_lemma = pd.DataFrame({'Predicted': lr_prediction_2_lemma}).assign(Id=test_df['Id'])

# Write one CSV per model, without the row index
lr_1_predictions_lemma.to_csv("lr_1_predictions_lemma.csv", index=False)
lr_2_predictions_lemma.to_csv("lr_2_predictions_lemma.csv", index=False)

# Other Analyses

## KNN

In [29]:
from sklearn.neighbors import KNeighborsClassifier

In [30]:
# KNN lemmatized
# The KNN grid gets its own name so the logistic-regression `parameters` grid
# from the earlier cells is not overwritten (re-running those search cells
# after this one would otherwise pick up KNN-only keys).
# Unlike the earlier grid, this one also tunes the classifier (neighbour count
# and vote weighting) and uses a smaller set of `min_df` values.

pipeline_3_lemma = Pipeline([
                     ('vectorizer' , TfidfVectorizer(stop_words=stop_words_list)),
                     ('classifier' , KNeighborsClassifier())
                    ])

knn_parameters = {'classifier__n_neighbors' : [15,35,57],
                  'classifier__weights': ['distance', 'uniform'],
                  'vectorizer__max_df' : [.6,.8],
                  'vectorizer__min_df' : [.01,.05],
                  'vectorizer__ngram_range': [(1,1),(1,2)]}

# 3-fold CV over 3 * 2 * 2 * 2 * 2 = 48 candidates, on all available cores.
grid_knn_lemma = GridSearchCV(pipeline_3_lemma,
                    knn_parameters,
                    cv = 3,
                    n_jobs = -1,
                    verbose = 1)

In [31]:
# Run the KNN grid search on the lemmatized plot text.
grid_knn_lemma.fit(train_df['lemmatized'],train_df['Genre'])

Fitting 3 folds for each of 48 candidates, totalling 144 fits


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('vectorizer',
                                        TfidfVectorizer(stop_words=['a', 'now',
                                                                    'in', 'on',
                                                                    'to', 'has',
                                                                    'about',
                                                                    'for',
                                                                    'that',
                                                                    'by',
                                                                    'from',
                                                                    'an', 'or',
                                                                    'as'])),
                                       ('classifier', KNeighborsClassifier())]),
             n_jobs=-1,
             param_grid={'classifier__n_neighb

In [32]:
# Keep the best pipeline found by the KNN grid search.
best_pipe_knn_lemma = grid_knn_lemma.best_estimator_

In [33]:
# Predict test-set genres with the best KNN pipeline and write the submission CSV.
knn_prediction_lemma = best_pipe_knn_lemma.predict(test_df['lemmatized'])
knn_predictions_lemma = (
    pd.DataFrame({'Predicted': knn_prediction_lemma})
    .assign(Id=test_df['Id'])
)
knn_predictions_lemma.to_csv("knn_predictions_lemma.csv", index=False)

## Topic Modelling

In [34]:
from sklearn.decomposition import LatentDirichletAllocation

In [35]:
# Bag-of-words counts for the topic model: unigrams only, custom stop words
# removed, and terms dropped when they appear in more than 80% or fewer than
# 1% of the documents.
vectorizer_topicmodel = CountVectorizer(lowercase   = True,
                             ngram_range = (1,1),
                             max_df      = .80,
                             stop_words  = stop_words_list,
                             min_df      = .01,
                             max_features = None)

# LDA is stochastic; a fixed random_state makes the learned topics, and every
# prediction built on them, reproducible across runs.
lda_model = LatentDirichletAllocation(n_components   = 8,
                                      max_iter       = 50,
                                      evaluate_every = 5,
                                      verbose = 1,
                                      random_state = 42
                                     )

In [36]:
# Learn the vocabulary from the training plots only, then encode both splits with it.
train_lemmas, test_lemmas = train_df['lemmatized'], test_df['lemmatized']
review_tf = vectorizer_topicmodel.fit(train_lemmas).transform(train_lemmas)
review_testdf = vectorizer_topicmodel.transform(test_lemmas)

In [37]:
# Fit the topic model on the training term counts.
lda_model.fit(review_tf)

iteration: 1 of max_iter: 50
iteration: 2 of max_iter: 50
iteration: 3 of max_iter: 50
iteration: 4 of max_iter: 50
iteration: 5 of max_iter: 50, perplexity: 845.5650
iteration: 6 of max_iter: 50
iteration: 7 of max_iter: 50
iteration: 8 of max_iter: 50
iteration: 9 of max_iter: 50
iteration: 10 of max_iter: 50, perplexity: 831.3062
iteration: 11 of max_iter: 50
iteration: 12 of max_iter: 50
iteration: 13 of max_iter: 50
iteration: 14 of max_iter: 50
iteration: 15 of max_iter: 50, perplexity: 825.9670
iteration: 16 of max_iter: 50
iteration: 17 of max_iter: 50
iteration: 18 of max_iter: 50
iteration: 19 of max_iter: 50
iteration: 20 of max_iter: 50, perplexity: 823.5243
iteration: 21 of max_iter: 50
iteration: 22 of max_iter: 50
iteration: 23 of max_iter: 50
iteration: 24 of max_iter: 50
iteration: 25 of max_iter: 50, perplexity: 822.3248
iteration: 26 of max_iter: 50
iteration: 27 of max_iter: 50
iteration: 28 of max_iter: 50
iteration: 29 of max_iter: 50
iteration: 30 of max_iter: 50

LatentDirichletAllocation(evaluate_every=5, max_iter=50, n_components=8,
                          verbose=1)

In [38]:
# Now we use our topics as features

In [39]:
# Per-document topic weights from the fitted LDA model become the classifier's features.
topics_train = lda_model.transform(review_tf)
lr_classifier_count = LogisticRegression(max_iter=5000, solver='lbfgs')
lr_classifier_count.fit(X=topics_train, y=train_df['Genre'])

LogisticRegression(max_iter=5000)

In [40]:
# Project the test term counts onto the topics learned from the training data.
topics_test = lda_model.transform(review_testdf)

In [41]:
# Predict test-set genres from the topic features and write the submission CSV.
prediction_topic = lr_classifier_count.predict(topics_test)
prediction_topic_df = (
    pd.DataFrame({'Predicted': prediction_topic})
    .assign(Id=test_df['Id'])
)
prediction_topic_df.to_csv("prediction_topicmodelling.csv", index=False)