# Movie Classification

## 1. Web Scrapping

ทำการ Scrape โดยเลือกประเภทหนังเป็น Feature Film ในประเทศอเมริกา ตั้งแต่ปี 2018 ถึง 2021

In [3]:
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [2]:
page_movie = []

for i in [2018, 2019, 2020, 2021]:
    for j in range(1, 4200, 100):
        url = "https://www.imdb.com/search/title/?title_type=feature&release_date="+str(i)+"-01-01,"+str(i)+"-12-31&countries=us&sort=alpha,asc&count=100&start="+str(j)+"&ref_=adv_nxt"
        response = get(url)
        html_soup = BeautifulSoup(response.text, 'html.parser')
        movie_containers = html_soup.find_all('div', class_ = 'lister-item mode-advanced')

        names = []
        years = []
        genres = []
        descriptions = []

        for container in movie_containers:

            if (container.find('span', attrs = {'class': 'genre'}) is not None) and ((container.find_all('p', attrs={'class':'text-muted'})[-1].text) != '\nAdd a Plot\n'):

                name = container.h3.a.text
                names.append(name)

                year = container.h3.find('span', class_ = 'lister-item-year').text
                years.append(year)

                genre = container.find('span', attrs = {'class': 'genre'}).text
                genres.append(genre)

                description = container.find_all('p', attrs={'class':'text-muted'})[-1].text
                descriptions.append(description)
        test_df = pd.DataFrame({'movie': names,
                           'year': years,
                           'genres': genres,
                           'descriptions': descriptions})
        page_movie.append(test_df)

In [3]:
page_movie[0]

Unnamed: 0,movie,year,genres,descriptions
0,#1915House,(2018),"\nHorror, Thriller",\nA century of secrets are hidden behind the f...
1,#SquadGoals,(2018),"\nDrama, Thriller","\nSamantha is a serious journalist, both for t..."
2,#Victoria,(2018),"\nAction, Drama",\nVictoria moves from out of town into the poo...
3,1 Angry Black Man,(2018),\nDrama,\nMike Anderson is a senior at the quintessent...
4,"1 Dead man, 6 Suspects, $65 Million",(2018),\nMystery,"\nA low budget but slick ""who done it"" involvi..."
...,...,...,...,...
82,A Motion Selfie,(2018),"\nBiography, Comedy, Drama","\n""A Motion Selfie"" is one-of-a-kind DIY filmm..."
83,A Murder of Innocence,(2018),\nCrime,"\nBased on a true story, a community is rocked..."
84,A Natural Disaster,(2018),\nDrama,\n24 hours after an earthquake shakes Los Ange...
85,A Nice Quiet Life,(2018),"\nDrama, Romance",\nTwo college students fall in love one year b...


In [4]:
df = pd.concat(page_movie).reset_index(drop=True)
df

Unnamed: 0,movie,year,genres,descriptions
0,#1915House,(2018),"\nHorror, Thriller",\nA century of secrets are hidden behind the f...
1,#SquadGoals,(2018),"\nDrama, Thriller","\nSamantha is a serious journalist, both for t..."
2,#Victoria,(2018),"\nAction, Drama",\nVictoria moves from out of town into the poo...
3,1 Angry Black Man,(2018),\nDrama,\nMike Anderson is a senior at the quintessent...
4,"1 Dead man, 6 Suspects, $65 Million",(2018),\nMystery,"\nA low budget but slick ""who done it"" involvi..."
...,...,...,...,...
12369,Zo in Exile,(2021),\nFantasy,\nZo and friends venture off for a weekend get...
12370,Zone Drifter,(2021),\nSci-Fi,"\nIn the distant future, a former soldier batt..."
12371,Zooey,(2021),"\nComedy, Drama, Sci-Fi","\nWhen a young boy, Noah, meets Zooey, a young..."
12372,Zwtral,(2021),\nMystery,\nMaking trades is his life. Justin finds hims...


## 2. Data Preparation

### year

In [5]:
df.year.unique()

array(['(2018)', '(I) (2018)', '', '(II) (2018)', '(III) (2018)',
       '(IV) (2018)', '(V) (2018)', '(IX) (2018)', '(XXIII)',
       '(VI) (2018)', '(VIII) (2018)', '(2019)', '(VII)',
       '(XVIII) (2018)', '(I)', '(II)', '(XVI) (2018)', '(II) (2019)',
       '(I) (2019)', '(IV) (2019)', '(V) (2019)', '(III) (2019)',
       '(VI) (2019)', '(VIII) (2019)', '(XXXV) (2019)', '(VII) (2019)',
       '(X) (2019)', '(IX) (2019)', '(XII) (2019)', '(2020)',
       '(III) (2020)', '(II) (2020)', '(I) (2020)', '(XXI) (2020)',
       '(V) (2020)', '(VI) (2020)', '(IV) (2020)', '(2021)',
       '(VIII) (2020)', '(XI) (2020)', '(I) (2021)', '(II) (2021)',
       '(IV) (2021)', '(XII) (2021)', '(III) (2021)', '(VI) (2021)',
       '(V) (2021)', '(VI)'], dtype=object)

In [6]:
df.year = df.year.str.extract('(\d+)')
df.year = df.year.ffill(axis = 0)
df.year.unique()

array(['2018', '2019', '2020', '2021'], dtype=object)

In [7]:
df.groupby(['year']).size().reset_index(name='count')

Unnamed: 0,year,count
0,2018,3359
1,2019,2907
2,2020,2929
3,2021,3179


In [8]:
df.genres = df.genres.str.strip()
df.head()

Unnamed: 0,movie,year,genres,descriptions
0,#1915House,2018,"Horror, Thriller",\nA century of secrets are hidden behind the f...
1,#SquadGoals,2018,"Drama, Thriller","\nSamantha is a serious journalist, both for t..."
2,#Victoria,2018,"Action, Drama",\nVictoria moves from out of town into the poo...
3,1 Angry Black Man,2018,Drama,\nMike Anderson is a senior at the quintessent...
4,"1 Dead man, 6 Suspects, $65 Million",2018,Mystery,"\nA low budget but slick ""who done it"" involvi..."


In [81]:
#df.to_pickle('df.pickle')
df = pd.read_pickle('df.pickle')
df.head()

Unnamed: 0,movie,year,genres,descriptions
0,#1915House,2018,"Horror, Thriller",\nA century of secrets are hidden behind the f...
1,#SquadGoals,2018,"Drama, Thriller","\nSamantha is a serious journalist, both for t..."
2,#Victoria,2018,"Action, Drama",\nVictoria moves from out of town into the poo...
3,1 Angry Black Man,2018,Drama,\nMike Anderson is a senior at the quintessent...
4,"1 Dead man, 6 Suspects, $65 Million",2018,Mystery,"\nA low budget but slick ""who done it"" involvi..."


In [7]:
import nltk
from nltk.stem.porter import *
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import *

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
import pickle

In [6]:
all_genres = df.genres.tolist()
all_genres = ", ".join(all_genres)
all_genres = sorted(list(set(all_genres.split(", "))))
all_genres

['Action',
 'Adventure',
 'Animation',
 'Biography',
 'Comedy',
 'Crime',
 'Drama',
 'Family',
 'Fantasy',
 'Game-Show',
 'History',
 'Horror',
 'Music',
 'Musical',
 'Mystery',
 'News',
 'Reality-TV',
 'Romance',
 'Sci-Fi',
 'Sport',
 'Talk-Show',
 'Thriller',
 'War',
 'Western']

In [82]:
for g in all_genres:
  df[g] = df["genres"].apply(lambda x: 1 if len(set(x.split(", ")).intersection(set([g]))) > 0 else 0)

In [83]:
df.descriptions = df.descriptions.str.strip()
df.descriptions = df.descriptions.str.replace(
    '...                See full summary\xa0»', '')
df.head()


Unnamed: 0,movie,year,genres,descriptions,Action,Adventure,Animation,Biography,Comedy,Crime,...,Mystery,News,Reality-TV,Romance,Sci-Fi,Sport,Talk-Show,Thriller,War,Western
0,#1915House,2018,"Horror, Thriller",A century of secrets are hidden behind the fre...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,#SquadGoals,2018,"Drama, Thriller","Samantha is a serious journalist, both for the...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,#Victoria,2018,"Action, Drama",Victoria moves from out of town into the poor ...,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1 Angry Black Man,2018,Drama,Mike Anderson is a senior at the quintessentia...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"1 Dead man, 6 Suspects, $65 Million",2018,Mystery,"A low budget but slick ""who done it"" involving...",0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [84]:
def stopword_removal(x, stpws=stopwords.words("english")):
  return list(set(x).difference(set(stpws)))

def lemmatization(x, stemmer=PorterStemmer()):
  return [stemmer.stem(i) for i in x]

def number_removal(x):
  return [i for i in x if not i.isnumeric()]

In [85]:
df["corpus"] = df["descriptions"].apply(lambda x: word_tokenize(x))
df["corpus"] = df["corpus"].apply(lambda x: stopword_removal(x))
df["corpus"] = df["corpus"].apply(lambda x: lemmatization(x))
df["corpus"] = df["corpus"].apply(lambda x: number_removal(x))
df["corpus"] = df["corpus"].apply(lambda x: " ".join(x))
corpus = df["corpus"].tolist()


In [86]:
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(corpus)
X = tfidf.toarray()
y = df.iloc[:, 4:28].values


In [87]:
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [88]:
y

array([[0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

## Classification without LDA

In [89]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


In [90]:
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

RandomForestClassifier(random_state=42)

In [92]:
filename = 'rf_withoutLDA.sav'
pickle.dump(rf, open(filename, 'wb'))
rf = pickle.load(open(filename, 'rb'))

In [93]:
y_pred = rf.predict(X_test)

In [94]:
f1 = []
for i in range(X_test.shape[0]):
  f1.append(f1_score(y_test[i], y_pred[i]))


In [95]:
f1

[1.0,
 0.5,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.6666666666666666,
 1.0,
 0.0,
 0.5,
 0.0,
 0.0,
 0.6666666666666666,
 0.8,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.6666666666666666,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.5,
 0.6666666666666666,
 0.0,
 1.0,
 0.5,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.6666666666666666,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.6666666666666666,
 0.6666666666666666,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.5,
 0.0,
 0.5,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.5,
 0.5,
 0.6666666666666666,
 0.0,
 1.0,
 0.0,
 1.0,
 0.6666666666666666,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.6666666666666666,
 0.0,
 1.0,
 0.0,
 0.6666666666666666,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.6666666666

In [96]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score
# print("10CV score:", cross_val_score(rf, X_train, y_train, cv=10))
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.75      0.01      0.02       279
           1       0.00      0.00      0.00       162
           2       0.50      0.03      0.05       120
           3       0.00      0.00      0.00        52
           4       0.85      0.08      0.14       613
           5       0.00      0.00      0.00       179
           6       0.59      0.63      0.61      1006
           7       1.00      0.05      0.09       125
           8       0.50      0.01      0.02       112
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00        36
          11       0.74      0.23      0.35       469
          12       0.00      0.00      0.00        40
          13       0.00      0.00      0.00        34
          14       0.00      0.00      0.00       143
          15       0.00      0.00      0.00         2
          16       1.00      0.25      0.40         4
          17       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Classification with LDA

https://machinelearninggeek.com/latent-dirichlet-allocation-using-scikit-learn/

In [97]:
from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation(n_components=5, random_state=0)
lda.fit_transform(X_train)

array([[0.03746049, 0.03735392, 0.03684807, 0.85096234, 0.03737518],
       [0.83388607, 0.04140226, 0.04183449, 0.04161374, 0.04126343],
       [0.03946069, 0.03973655, 0.84156798, 0.03971062, 0.03952416],
       ...,
       [0.03523212, 0.03508015, 0.03494415, 0.85960734, 0.03513625],
       [0.03835351, 0.03888871, 0.45896183, 0.42484402, 0.03895193],
       [0.04263864, 0.04243453, 0.04245367, 0.82959355, 0.04287961]])

In [98]:
lda_components = lda.components_
terms = vectorizer.get_feature_names()

for index, component in enumerate(lda_components):
    zipped = zip(terms, component)
    top_terms_key = sorted(zipped, key=lambda t: t[1], reverse=True)[:7]
    top_terms_list = list(dict(top_terms_key).keys())
    print("Topic "+str(index)+": ", top_terms_list)


Topic 0:  ['life', 'the', 'year', 'find', 'live', 'friend', 'two']
Topic 1:  ['the', 'life', 'young', 'film', 'new', 'find', 'friend']
Topic 2:  ['the', 'young', 'find', 'life', 'stori', 'love', 'world']
Topic 3:  ['young', 'friend', 'find', 'one', 'the', 'two', 'famili']
Topic 4:  ['life', 'young', 'man', 'find', 'stori', 'love', 'famili']
