In [1]:

import pandas as pd  
from sklearn.naive_bayes import MultinomialNB 
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.pipeline import Pipeline  
from sklearn.model_selection import train_test_split  
from sklearn.metrics import confusion_matrix, accuracy_score  
import matplotlib.pyplot as plt  
import seaborn as sn  
import warnings

warnings.filterwarnings("ignore")

In [2]:
# Reading the Data
TestData = pd.read_csv("test_data.csv")
TestData = TestData[["TITLE", "DESCRIPTION"]]
TestDataSol = pd.read_csv("test_data_solution.csv")
TrainData = pd.read_csv("train_data.csv")

In [3]:
TestData.head()

Unnamed: 0,TITLE,DESCRIPTION
0,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apar..."
1,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty ch..."
2,Off the Beaten Track (2010),One year in the life of Albin and his family ...
3,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with hi..."
4,Er nu zhai (1955),Before he was known internationally as a mart...


In [4]:
TrainData.head()

Unnamed: 0.1,Unnamed: 0,TITLE,GENRE,DESCRIPTION
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous r...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...


In [5]:
TestDataSol.head()


Unnamed: 0.1,Unnamed: 0,TITLE,GENRE,DESCRIPTION
0,1,Edgar's Lunch (1998),thriller,"L.R. Brane loves his life - his car, his apar..."
1,2,La guerra de papá (1977),comedy,"Spain, March 1964: Quico is a very naughty ch..."
2,3,Off the Beaten Track (2010),documentary,One year in the life of Albin and his family ...
3,4,Meu Amigo Hindu (2015),drama,"His father has died, he hasn't spoken with hi..."
4,5,Er nu zhai (1955),drama,Before he was known internationally as a mart...


In [6]:
import nltk
from nltk.stem import LancasterStemmer
from nltk.corpus import stopwords
import re
import string

# Initialize stemmer and stopwords
stemmer = LancasterStemmer()
stop_words = set(stopwords.words("english"))


def cleaning_data(text):
    text = text.lower()
    text = re.sub(r'@\S+', '', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'.pic\S+', '', text)
    text = re.sub(r'[^a-zA-Z+]', ' ', text)
    text = "".join([i for i in text if i not in string.punctuation])
    words = nltk.word_tokenize(text)
    text = " ".join([i for i in words if i not in stop_words and len(i) > 2])
    text = re.sub(r"\s+", " ", text).strip()
    return text


TrainData["TextCleaning"] = TrainData["DESCRIPTION"].apply(cleaning_data)
TestData["TextCleaning"] = TestData["DESCRIPTION"].apply(cleaning_data)

In [7]:
# Diffrent Types of Movies
print(len(TrainData.GENRE.unique()))
TrainData.GENRE.unique()

27


array([' drama ', ' thriller ', ' adult ', ' documentary ', ' comedy ',
       ' crime ', ' reality-tv ', ' horror ', ' sport ', ' animation ',
       ' action ', ' fantasy ', ' short ', ' sci-fi ', ' music ',
       ' adventure ', ' talk-show ', ' western ', ' family ', ' mystery ',
       ' history ', ' news ', ' biography ', ' romance ', ' game-show ',
       ' musical ', ' war '], dtype=object)

In [8]:
#encoding

from sklearn.preprocessing import LabelEncoder

GENRElabel = LabelEncoder()
TrainData['GENRE_n'] = GENRElabel.fit_transform(TrainData['GENRE'])

# droping old coulmn
TrainData = TrainData.drop("GENRE", axis=1)

print("Elemnts count : ", 1 + TrainData.GENRE_n.max())

# Displaying the dataframe
TrainData.head()

Elemnts count :  27


Unnamed: 0.1,Unnamed: 0,TITLE,DESCRIPTION,TextCleaning,GENRE_n
0,1,Oscar et la dame rose (2009),Listening in to a conversation between his do...,listening conversation doctor parents year old...,8
1,2,Cupid (1997),A brother and sister with a past incestuous r...,brother sister past incestuous relationship cu...,24
2,3,"Young, Wild and Wonderful (1980)",As the bus empties the students for their fie...,bus empties students field trip museum natural...,1
3,4,The Secret Sin (1915),To help their unemployed father make ends mee...,help unemployed father make ends meet edith tw...,8
4,5,The Unrecovered (2007),The film's title refers not only to the un-re...,film title refers recovered bodies ground zero...,8


In [9]:
clf = Pipeline([('vectorizer', CountVectorizer()), ('nb', MultinomialNB())])

In [10]:
clf.fit(TrainData.TextCleaning,TrainData.GENRE_n)

In [11]:
y_pred = clf.predict(TestData.TextCleaning)

y_true = GENRElabel.fit_transform(TestDataSol['GENRE'])


accuracy = accuracy_score(y_true, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 53.75%


In [12]:

Descriptions = [
    "Listening in to a conversation between his doctor and parents, 10-year-old Oscar learns what nobody has the courage to tell him. He only has a few weeks to live. Furious, he refuses to speak to anyone except straight-talking Rose, the lady in pink he meets on the hospital stairs. As Christmas approaches, Rose uses her fantastical experiences as a professional wrestler, her imagination, wit and charm to allow Oscar to live life and love to the full, in the company of his friends Pop Corn, Einstein, Bacon and childhood sweetheart Peggy Blue.",
    "In tough economic times Max and Joey have all but run out of ideas until, they discover that senior housing is cheap. Not only that but Max's aunt just kicked the bucket and no one knows yet. In a hilarious series that always keeps you on your toes, the two friends take us on a cross-dressing, desperate and endearing ride through being broke."
]
print("Prediction:", GENRElabel.inverse_transform(clf.predict(Descriptions)))

Prediction: [' drama ' ' comedy ']


In [13]:

import ipywidgets as widgets
from IPython.display import display


def on_button_click(b):
    text_value = text_box.value
    print("Film's GENRE is : ",
          GENRElabel.inverse_transform([int(clf.predict([text_value]))]))


button = widgets.Button(description="PREDICT")
text_box = widgets.Text(placeholder="Enter description")
text_box.layout.width = '500px'
text_box.layout.height = '30px'


button.on_click(on_button_click)

display(text_box)
display(button)

Text(value='', layout=Layout(height='30px', width='500px'), placeholder='Enter description')

Button(description='PREDICT', style=ButtonStyle())