In [1]:
import pandas as pd
import numpy as np
import json
import nltk
import re
import csv
import matplotlib.pyplot as plt 
import seaborn as sns
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

%matplotlib inline
pd.set_option('display.max_colwidth', 300)

In [2]:
df= pd.read_csv('occur_cleaned_data.csv', encoding ='utf-8')
df.head()

Unnamed: 0,title,id,genres,keywords
0,Four Rooms,5,"['Crime', 'Comedy']","['hotel', ""new year's ev"", 'witch', 'bet', 'hotel room', 'antholog', 'los angeles, california', 'hoodlum', 'woman director']"
1,Judgment Night,6,"['Action', 'Thriller', 'Crime']","['chicago, illinoi', 'drug deal', 'escap', 'one night', 'box']"
2,Star Wars,11,"['Adventure', 'Action', 'Science Fiction']","['android', 'galaxi', 'death star', 'lightsab', 'jedi', 'rescue miss', 'empir', 'rebellion', 'planet', 'the forc', 'space opera', 'stormtroop', 'totalitarian']"
3,Finding Nemo,12,"['Animation', 'Family']","['parent child relationship', 'computer anim', 'anthropomorph', 'underwat', 'shark', 'sea turtl', 'aftercreditssting', 'duringcreditssting', 'protective fath']"
4,Forrest Gump,13,"['Comedy', 'Drama', 'Romance']","['vietnam veteran', 'hippi', 'washington d.c.', 'mentally dis', 'based on novel or book', 'parent child relationship', 'vietnam war', 'friendship', 'love', 'family relationship', 'militari', 'wounded soldi', 'false histori', 'tragicomedi']"


In [3]:
df['genres'][2]

"['Adventure', 'Action', 'Science Fiction']"

In [10]:
df['genres'][2][0]

'['

In [4]:
processing_data = df[['genres','keywords']]
processing_data.head()

Unnamed: 0,genres,keywords
0,"['Crime', 'Comedy']","['hotel', ""new year's ev"", 'witch', 'bet', 'hotel room', 'antholog', 'los angeles, california', 'hoodlum', 'woman director']"
1,"['Action', 'Thriller', 'Crime']","['chicago, illinoi', 'drug deal', 'escap', 'one night', 'box']"
2,"['Adventure', 'Action', 'Science Fiction']","['android', 'galaxi', 'death star', 'lightsab', 'jedi', 'rescue miss', 'empir', 'rebellion', 'planet', 'the forc', 'space opera', 'stormtroop', 'totalitarian']"
3,"['Animation', 'Family']","['parent child relationship', 'computer anim', 'anthropomorph', 'underwat', 'shark', 'sea turtl', 'aftercreditssting', 'duringcreditssting', 'protective fath']"
4,"['Comedy', 'Drama', 'Romance']","['vietnam veteran', 'hippi', 'washington d.c.', 'mentally dis', 'based on novel or book', 'parent child relationship', 'vietnam war', 'friendship', 'love', 'family relationship', 'militari', 'wounded soldi', 'false histori', 'tragicomedi']"


## String processing. Genres in csv was stored as string instead of a list

Using ast to convert genres from string to list.

In [15]:
import ast

In [21]:
# find how many rows in data
processing_data.shape

(2932, 2)

In [16]:
# For loop on all the strings
for i in range(2932):
    processing_data['genres'][i] = ast.literal_eval(processing_data['genres'][i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


In [17]:
processing_data.head()

Unnamed: 0,genres,keywords
0,"[Crime, Comedy]","['hotel', ""new year's ev"", 'witch', 'bet', 'hotel room', 'antholog', 'los angeles, california', 'hoodlum', 'woman director']"
1,"[Action, Thriller, Crime]","['chicago, illinoi', 'drug deal', 'escap', 'one night', 'box']"
2,"[Adventure, Action, Science Fiction]","['android', 'galaxi', 'death star', 'lightsab', 'jedi', 'rescue miss', 'empir', 'rebellion', 'planet', 'the forc', 'space opera', 'stormtroop', 'totalitarian']"
3,"[Animation, Family]","['parent child relationship', 'computer anim', 'anthropomorph', 'underwat', 'shark', 'sea turtl', 'aftercreditssting', 'duringcreditssting', 'protective fath']"
4,"[Comedy, Drama, Romance]","['vietnam veteran', 'hippi', 'washington d.c.', 'mentally dis', 'based on novel or book', 'parent child relationship', 'vietnam war', 'friendship', 'love', 'family relationship', 'militari', 'wounded soldi', 'false histori', 'tragicomedi']"


In [20]:
processing_data['genres'][2][1]

'Action'

## String processing on keywords. Remove whitespaces

In [22]:
# For loop on all the strings
for i in range(2932):
    processing_data['keywords'][i] = processing_data['keywords'][i].replace(" ", "")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


In [66]:
processing_data.head(100)

Unnamed: 0,genres,keywords
0,"[Crime, Comedy]","['hotel',""newyear'sev"",'witch','bet','hotelroom','antholog','losangeles,california','hoodlum','womandirector']"
1,"[Action, Thriller, Crime]","['chicago,illinoi','drugdeal','escap','onenight','box']"
2,"[Adventure, Action, Science Fiction]","['android','galaxi','deathstar','lightsab','jedi','rescuemiss','empir','rebellion','planet','theforc','spaceopera','stormtroop','totalitarian']"
3,"[Animation, Family]","['parentchildrelationship','computeranim','anthropomorph','underwat','shark','seaturtl','aftercreditssting','duringcreditssting','protectivefath']"
4,"[Comedy, Drama, Romance]","['vietnamveteran','hippi','washingtond.c.','mentallydis','basedonnovelorbook','parentchildrelationship','vietnamwar','friendship','love','familyrelationship','militari','woundedsoldi','falsehistori','tragicomedi']"
...,...,...
95,"[Crime, Action, Thriller]","['detect','dystopia','murder','twin','basedongraphicnovel','darkhorsecom','neo-noir']"
96,"[Drama, Thriller, Mystery]","['christian','librari','inquisit','monk','poison','secretpassag','basedonnovelorbook','middleag','persecut','mediev','burnedatthestak','murdermysteri','theologicaldeb','14thcenturi']"
97,"[Science Fiction, Action, Adventure, Thriller]","['lossoflovedon','android','death','cosmo','spanninggener']"
98,"[Adventure, Comedy, Science Fiction]","['railroadrobb','california','indianterritori','sportscar','inventor','locomot','saloon','horsebackrid','timetravel','outlaw','sequel','madscientist','nativeamerican','western','shootout','gunfight','train','cavalri','1950']"


## sanity check on the processed data

In [25]:
processing_data['genres'][56][1]

'Comedy'

In [26]:
processing_data['keywords'][56]

"['london,england','chocol','factorywork','basedonnovelorbook','parentchildrelationship','candi','overweightchild','grandparentgrandchildrelationship','teacher']"

In [7]:
#from sklearn.feature_extraction.text import CountVectorizer

In [8]:
#vectorizer = CountVectorizer()

# Tf-idf and Count vectorizor

### For the vectorizors to work the input should be a string or a text file. Hence we need to concatenate keywords in list into a string and store them. For keywords with 2 or more words, we will add in - to combine them.

In [27]:
from sklearn.preprocessing import MultiLabelBinarizer

multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(processing_data['genres'])

# transform target variable
y = multilabel_binarizer.transform(processing_data['genres'])

In [33]:
tfidf_vectorizer = TfidfVectorizer()

In [35]:
# split dataset into training and validation set
xtrain, xval, ytrain, yval = train_test_split(processing_data['keywords'], y, test_size=0.2, random_state=9)

In [36]:
ytrain

array([[0, 0, 0, ..., 0, 1, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 0, 0, ..., 1, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [67]:
# create TF-IDF features
xtrain_tfidf = tfidf_vectorizer.fit_transform(xtrain)
xval_tfidf = tfidf_vectorizer.transform(xval)

In [68]:
from sklearn.linear_model import LogisticRegression

# Binary Relevance
from sklearn.multiclass import OneVsRestClassifier

# Performance metric
from sklearn.metrics import f1_score

In [69]:
lr = LogisticRegression()
clf = OneVsRestClassifier(lr)

In [70]:
# fit model on train data
clf.fit(xtrain_tfidf, ytrain)

OneVsRestClassifier(estimator=LogisticRegression())

In [71]:
# make predictions for validation set
y_pred = clf.predict(xval_tfidf)

In [72]:
y_pred[100]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0])

In [73]:
multilabel_binarizer.inverse_transform(y_pred)[0]

('Drama',)

In [74]:
# evaluate performance
f1_score(yval, y_pred, average="micro")

0.4022988505747126

## Advanced: Create inference function

In [75]:
def infer_tags(q):
    #q = clean_text(q)
    #q = remove_stopwords(q)
    q_vec = tfidf_vectorizer.transform([q])
    q_pred = clf.predict(q_vec)
    return multilabel_binarizer.inverse_transform(q_pred)

In [76]:
#for i in range(120):
    #print(multilabel_binarizer.inverse_transform(y_pred)[i])

('Drama',)
('Drama',)
('Drama',)
('Drama',)
('Crime', 'Drama', 'Thriller')
('Horror',)
()
('Crime', 'Mystery', 'Thriller')
('Romance',)
('Drama',)
('Romance',)
()
('Action', 'Adventure', 'Science Fiction')
('Comedy', 'Drama')
('Drama',)
()
()
('Thriller',)
('Comedy', 'Drama')
()
('Action', 'Science Fiction')
()
('Drama',)
('Horror', 'Thriller')
('Drama',)
()
()
()
()
()
()
()
('Drama',)
('Drama',)
('Action',)
('Comedy', 'Drama')
()
()
('Drama',)
('Comedy', 'Drama')
('Crime',)
('Action',)
()
('Comedy', 'Family')
('Comedy', 'Drama')
('Drama',)
('Action', 'Science Fiction', 'Thriller')
()
('Action', 'Crime', 'Thriller')
()
('Drama', 'War')
('Crime', 'Drama', 'Thriller')
('Drama',)
('Comedy',)
('Drama',)
()
()
('Comedy',)
('Drama',)
('Comedy',)
()
('Thriller',)
()
()
()
('Drama',)
('Comedy', 'Drama')
()
()
('Drama',)
('Comedy', 'Romance')
('Drama',)
()
('Adventure', 'Comedy')
('Comedy', 'Drama')
('Comedy',)
('Action', 'Adventure', 'Science Fiction')
('Action', 'Science Fiction')
('Comedy',

## Trying out other classifiers

In [79]:
 # Import Decision Tree Classifier model from Scikit-Learn
from sklearn.tree import DecisionTreeClassifier

# Create a Decision Tree Classifier object
dectree = DecisionTreeClassifier(max_depth = 2)

In [80]:
clf = OneVsRestClassifier(dectree)

In [81]:
# fit model on train data
clf.fit(xtrain_tfidf, ytrain)

OneVsRestClassifier(estimator=DecisionTreeClassifier(max_depth=2))

In [82]:
# make predictions for validation set
y_pred = clf.predict(xval_tfidf)

In [85]:
y_pred[100]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [86]:
multilabel_binarizer.inverse_transform(y_pred)[100]

()

In [87]:
# evaluate performance
f1_score(yval, y_pred, average="micro")

0.2205240174672489

In [88]:
for i in range(120):
    print(multilabel_binarizer.inverse_transform(y_pred)[i])

('Drama',)
('Thriller',)
()
()
('Thriller',)
('Horror',)
()
('Thriller',)
('Romance',)
('Mystery', 'Thriller')
('Romance',)
('Mystery',)
('Adventure', 'Science Fiction')
()
('Drama',)
()
('Mystery',)
('Crime',)
()
()
('Science Fiction',)
()
()
()
()
()
()
()
()
()
()
()
()
()
()
()
('Drama', 'Thriller')
()
()
()
('Mystery', 'Thriller')
()
()
()
()
()
('Adventure',)
()
()
('Thriller',)
('War',)
('Crime', 'Thriller')
('Drama',)
()
()
()
()
()
()
('Romance',)
()
()
()
()
('Thriller',)
('Thriller',)
()
()
()
()
('Romance',)
('Drama', 'History')
()
('Family',)
()
()
('Action', 'Adventure')
()
()
()
()
()
('Mystery', 'Thriller')
('Romance',)
()
('Crime', 'Thriller')
()
()
('War',)
()
('Comedy',)
()
()
()
('Comedy',)
()
()
()
()
()
()
()
()
()
()
('Animation', 'Comedy', 'Family')
()
('Comedy',)
()
('Drama',)
()
()
('Animation',)
()
('Drama',)
()
()
()
('Fantasy',)
('Adventure', 'Comedy', 'Mystery', 'Thriller')


## other classifiers

Havent try
Random Forest
SVN
Naive Bayes

Tried and gonna use
GRboost
DecisionTree




In [117]:
# GRboost
from sklearn.ensemble import GradientBoostingClassifier
GRboost = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth=1, random_state=0)

In [118]:
clf = OneVsRestClassifier(GRboost)

In [119]:
# fit model on train data
clf.fit(xtrain_tfidf, ytrain)

OneVsRestClassifier(estimator=GradientBoostingClassifier(learning_rate=1.0,
                                                         max_depth=1,
                                                         random_state=0))

In [120]:
# make predictions for validation set
y_pred = clf.predict(xval_tfidf)

In [121]:
# evaluate performance
f1_score(yval, y_pred, average="micro")

0.4865076261243644

In [122]:
for i in range(120):
    print(multilabel_binarizer.inverse_transform(y_pred)[i])

('Drama',)
('Drama',)
()
('Adventure',)
('Crime', 'Thriller')
('Horror',)
('Comedy', 'Fantasy')
('Action', 'Crime', 'Drama', 'Mystery', 'Thriller')
('Romance',)
('Crime', 'Mystery')
('Romance',)
('Action', 'Mystery', 'Thriller')
('Action', 'Adventure', 'Science Fiction')
()
('Drama',)
()
('Comedy', 'Mystery')
('Drama', 'Fantasy', 'Thriller')
('Comedy', 'Crime')
()
('Action', 'Animation', 'Science Fiction')
()
()
()
('Drama', 'Romance')
('Drama',)
()
()
('Action', 'Adventure', 'Romance')
('Animation', 'Drama')
('Horror',)
()
('Comedy', 'Crime')
('Thriller',)
('Action', 'Crime', 'Thriller')
('Drama',)
('Drama', 'History', 'Thriller')
()
('Adventure',)
('Drama',)
('Crime', 'Mystery', 'Thriller')
('Thriller',)
()
()
('Comedy', 'Drama')
('Drama',)
('Action', 'Adventure', 'Science Fiction', 'Thriller')
('Action', 'Crime')
('Crime', 'Drama', 'Thriller')
('Drama',)
('Action', 'Romance')
('Crime', 'Drama', 'Thriller')
('Comedy', 'Drama', 'Mystery')
('Comedy',)
('Drama',)
('Action',)
('Action', 

## Review

The csv file given with keywords and genres are in Strings not lists like wtf
Need to convert genre into a list
https://stackoverflow.com/questions/1894269/how-to-convert-string-representation-of-list-to-a-list