In [1]:
#Import all python package needed for importing data
# import pandas as pd
# import numpy as np
# from sklearn.model_selection import StratifiedShuffleSplit,train_test_split,GridSearchCV
# from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# from sklearn.preprocessing import StandardScaler
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.model_selection import RepeatedStratifiedKFold

# import matplotlib.pyplot as plt

In [2]:
#using pymongo and pandas to connect mongodb to juypter and pandas to convert into df/dt
import pandas as pd
from pymongo import MongoClient


def _connect_mongo(host, port, username, password, database_name):
    """ A util for making a connection to mongo """

    if username and password:
        mongo_uri = 'mongodb://%s:%s@%s:%s/%s' % (username, password, host, port, database_name)
        conn = MongoClient(mongo_uri)
    else:
        conn = MongoClient(host, port)


    return conn[database_name]


def read_mongo(database_name, collection_name, query, host, port, username, password, no_id = True):
    """ Read from Mongo and Store into DataFrame """

    # Connect to MongoDB
    database_name = _connect_mongo(host=host, port=port, username=username, password=password, database_name=database_name)

    # Make a query to the specific DB and Collection
    cursor = database_name[collection_name].find(query)

    # Expand the cursor and construct the DataFrame
    df =  pd.DataFrame(iter(cursor))

    # Delete the _id
    if '_id' in df.columns:
        del df['_id']

    return df

In [3]:
#declare database from mongodb

#Change depending on user parameters
host = "localhost"
port = 27017
username=None
password=None
query = {}
mongo_uri = "mongodb://localhost:27017"
database_name = "Fyp-Test"
collection_name = "Netflix_Titles"

#Read file from database
df = read_mongo(database_name,collection_name,query,host,port,username,password)
df.columns


Index(['show_id', 'type', 'title', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description',
       'director'],
      dtype='object')

In [4]:
#Pre-propessing data 
#We want to find out what the relation between Rating ,duration and genre to see if there any correlation


#Remove columns except rating, type and genre
df = df[['rating','type','listed_in']]

#Check for any null values, if there is remove it
df['rating'].isnull().sum().sum()
df['type'].isnull().sum().sum()
df['listed_in'].isnull().sum().sum()

df.columns

Index(['rating', 'type', 'listed_in'], dtype='object')

In [5]:
#Use get_dummies to sort the ratings
df = pd.get_dummies(df,prefix=['rating'], columns = ['rating'], drop_first=True)
df.columns



Index(['type', 'listed_in', 'rating_NC-17', 'rating_NR', 'rating_PG',
       'rating_PG-13', 'rating_R', 'rating_TV-14', 'rating_TV-G',
       'rating_TV-MA', 'rating_TV-PG', 'rating_TV-Y', 'rating_TV-Y7',
       'rating_TV-Y7-FV', 'rating_UR'],
      dtype='object')

In [6]:
#separted the type into movies and tv show using bins
#create a function to split movies to 1 and TV shows to 2
def fun(types):
    if types == 'Movie':
        return '1'
    elif types =='TV Show':
        return '2'

#pass function to df
df['type'] = df['type'].apply(fun)

In [7]:
#Split listed_in into different cat
from pandas import Series


temp_df = df['listed_in'].str.split(', ').apply(Series,1).stack()

#lined up with df index
temp_df.index = temp_df.index.droplevel(-1)
#name the column to join
temp_df.name = 'listed_in'
#del current listed_in
del df['listed_in']
#join the new listed_in
df = df.join(temp_df)



In [8]:
#onehot the listen_in
df = pd.get_dummies(df,prefix=['listed_in'], columns = ['listed_in'], drop_first=True)
df.columns
#Data preprocessing is done

Index(['type', 'rating_NC-17', 'rating_NR', 'rating_PG', 'rating_PG-13',
       'rating_R', 'rating_TV-14', 'rating_TV-G', 'rating_TV-MA',
       'rating_TV-PG', 'rating_TV-Y', 'rating_TV-Y7', 'rating_TV-Y7-FV',
       'rating_UR', 'listed_in_Anime Features', 'listed_in_Anime Series',
       'listed_in_British TV Shows', 'listed_in_Children & Family Movies',
       'listed_in_Classic & Cult TV', 'listed_in_Classic Movies',
       'listed_in_Comedies', 'listed_in_Crime TV Shows',
       'listed_in_Cult Movies', 'listed_in_Documentaries',
       'listed_in_Docuseries', 'listed_in_Dramas',
       'listed_in_Faith & Spirituality', 'listed_in_Horror Movies',
       'listed_in_Independent Movies', 'listed_in_International Movies',
       'listed_in_International TV Shows', 'listed_in_Kids' TV',
       'listed_in_Korean TV Shows', 'listed_in_LGBTQ Movies',
       'listed_in_Movies', 'listed_in_Music & Musicals',
       'listed_in_Reality TV', 'listed_in_Romantic Movies',
       'listed_in_Rom

In [9]:
#We need to split the data into x_train, y_train , x_test , y_test
#For this case we will be using train_test_split , in the future we will be using the others
from sklearn.model_selection import StratifiedShuffleSplit,train_test_split,GridSearchCV

#declare x and y to be df and df['type']
#We will split into 20/80
x = df.iloc[:,df.columns !='type']
y = df['type']

x_train , x_test ,y_train , y_test = train_test_split(x,y, test_size=0.2)



In [10]:
#Once we have split the data we can choose a classification model since we want to find out if type and listed_in affects the rating
#Since the outcome will either be 1 or 0 depending if the ratings is yes or no , we have 5 options : 
# Naive Bayes
# k-Nearest Neighbors
# Decision Trees
# Support Vector Machine
# Logistic Regression
#we will start with Naive Bayes

In [11]:
# Naive Bayes
# first we must import gaussiannb(The basic Naive bayes)
from sklearn.naive_bayes import GaussianNB

#we will set a var to Gaussian
Gaussian = GaussianNB()

#First we will test with our train data first without changing the parameters - Pre tuning
pre_tune_NB_fit = Gaussian.fit(x_train, y_train)
pre_tune_NB_predict = pre_tune_NB_fit.predict(x_test)


In [12]:
#import using sklearn to present the predicted data
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

#Shows the accuracy/information of the predict
print(confusion_matrix(y_test, pre_tune_NB_predict))
print(classification_report(y_test, pre_tune_NB_predict))
print(accuracy_score(y_test, pre_tune_NB_predict))

[[2244   81]
 [   0 1090]]
              precision    recall  f1-score   support

           1       1.00      0.97      0.98      2325
           2       0.93      1.00      0.96      1090

    accuracy                           0.98      3415
   macro avg       0.97      0.98      0.97      3415
weighted avg       0.98      0.98      0.98      3415

0.976281112737921


In [13]:
#post tune
from sklearn.model_selection import RepeatedStratifiedKFold
#Set the numbers of folds and cross validation
cv_method = RepeatedStratifiedKFold(n_splits=5, 
                                    n_repeats=3, 
                                    random_state=999)

import numpy as np
np.logspace(0,-9, num=10)

np.random.seed(999)

#Change Portion of the largest variance of all features that is added to variances for calculation stability.
params_NB = {'var_smoothing': np.logspace(0,-9, num=100)}

gs_NB = GridSearchCV(estimator=Gaussian, 
                     param_grid=params_NB, 
                     cv=cv_method,
                     verbose=1,
                     scoring='accuracy')

In [14]:
#Fit the model
post_tune_NB_fit = gs_NB.fit(x_train,y_train)

#Predict after post tune
post_tune_NB_predict =post_tune_NB_fit.predict(x_test)

Fitting 15 folds for each of 100 candidates, totalling 1500 fits


In [15]:
#Shows the accuracy after post tuning
print(confusion_matrix(y_test, post_tune_NB_predict))
print(classification_report(y_test, post_tune_NB_predict))
print(accuracy_score(y_test, post_tune_NB_predict))



[[2317    8]
 [   0 1090]]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00      2325
           2       0.99      1.00      1.00      1090

    accuracy                           1.00      3415
   macro avg       1.00      1.00      1.00      3415
weighted avg       1.00      1.00      1.00      3415

0.9976573938506589


In [16]:
#Now we have the accuracy to its highest, we need to present it so that its presentable 
#We shall us matplot to present our plots
import matplotlib.pyplot as plt