# KNN Model

In [137]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA

# Shared keyword arguments: one fixed seed so randomized steps are repeatable
kwargs = {'random_state': 42}

In [138]:
# Load datasets
# read_csv already splits on ',' by default, so no separator is passed explicitly.

# Movie list from the preprocessed folder
movies = pd.read_csv('../../data/preprocessed/movies_id_updated.csv')

# Raw tables
actors = pd.read_csv('../../data/raw/actors.csv')
countries = pd.read_csv('../../data/raw/countries.csv')
directors = pd.read_csv('../../data/raw/directors.csv')
genres = pd.read_csv('../../data/raw/genres.csv')
locations = pd.read_csv('../../data/raw/locations.csv')
movie_tags = pd.read_csv('../../data/raw/movie_tags.csv')
ratings = pd.read_csv('../../data/raw/ratings.csv')
tags = pd.read_csv('../../data/raw/tags.csv')

# NOTE(review): unlike the paths above, this one has no 'data/' segment - confirm it is intended.
omdb = pd.read_csv('../../preprocessed/omdb_cleaned.csv')

In [139]:
# Merge the per-movie tables the same way Christin did (the resulting CSV was 1.11 GB).

# Reduce `movies` to the key plus title/year and rename the key to 'movieID'.
# Guarded so the cell can be re-run: once 'id' has been renamed, selecting it
# again would raise a KeyError.
if 'id' in movies.columns:
    movies = movies[['id', 'title', 'year']].rename(columns={'id': 'movieID'})

# dropna() returns a new frame, so the result has to be assigned;
# a bare `actors.dropna()` leaves `actors` unchanged.
actors = actors.dropna()
merged_movies = pd.merge(movies, actors, how='outer', on='movieID')

countries = countries.dropna()
merged_movies = pd.merge(merged_movies, countries, how='outer', on='movieID')

In [140]:
# dropna() returns a new frame, so the result has to be assigned;
# a bare `directors.dropna()` leaves `directors` unchanged.
directors = directors.dropna()
merged_movies = pd.merge(merged_movies, directors, how='outer', on='movieID')

genres = genres.dropna()
merged_movies = pd.merge(merged_movies, genres, how='outer', on='movieID')

# Every rating of a movie is paired with every row already present for that movie.
merged_movies = pd.merge(merged_movies, ratings, how='outer', on='movieID')

# The outer merges leave NaN wherever a table had no match for a movie; drop those rows.
merged_movies = merged_movies.dropna()

In [108]:
# Show the merged frame as the cell output
merged_movies

Unnamed: 0,movieID,title,year,actorID,actorName,ranking,country,directorID,directorName,genre,user_id,rating
0,1,Toy story,1995.0,annie_potts,Annie Potts,10.0,USA,john_lasseter,John Lasseter,Adventure,1339.0,5.0
1,1,Toy story,1995.0,annie_potts,Annie Potts,10.0,USA,john_lasseter,John Lasseter,Adventure,551.0,3.5
2,1,Toy story,1995.0,annie_potts,Annie Potts,10.0,USA,john_lasseter,John Lasseter,Adventure,336.0,4.5
3,1,Toy story,1995.0,annie_potts,Annie Potts,10.0,USA,john_lasseter,John Lasseter,Adventure,1087.0,3.5
4,1,Toy story,1995.0,annie_potts,Annie Potts,10.0,USA,john_lasseter,John Lasseter,Adventure,1598.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...
59296576,65133,Blackadder Back & Forth,1999.0,tim_mcinnerny,Tim McInnerny,9.0,UK,paul_weiland,Paul Weiland,Comedy,480.0,5.0
59296577,65133,Blackadder Back & Forth,1999.0,tim_mcinnerny,Tim McInnerny,9.0,UK,paul_weiland,Paul Weiland,Comedy,1228.0,3.0
59296578,65133,Blackadder Back & Forth,1999.0,tony_robinson,Tony Robinson,10.0,UK,paul_weiland,Paul Weiland,Comedy,1059.0,4.0
59296579,65133,Blackadder Back & Forth,1999.0,tony_robinson,Tony Robinson,10.0,UK,paul_weiland,Paul Weiland,Comedy,480.0,5.0


In [159]:
# Work on the leading rows only for now so the following steps run faster
# NOTE(review): rows are taken from the top of the frame rather than sampled; if the
# frame is ordered by movieID this subset may cover very few movies - confirm.
merged_movies = merged_movies.head(100000)

# Features: the first eleven columns, selected by position
X = np.array(merged_movies.iloc[:, :11])

# Target: the rating column
y = np.array(merged_movies.loc[:, 'rating'])

In [160]:
# Sanity check: X and y must have the same number of rows
print(X.shape, y.shape, sep="\n")

(100000, 11)
(100000,)


In [161]:
# Preprocess data: replace the values of every column (strings included) by integer codes.
# The encoder is refitted for each column, so after the loop `le` only holds the
# mapping of the last column.
le = preprocessing.LabelEncoder()
# Use the actual column count of X instead of a hard-coded 11, so the loop neither
# raises an IndexError on a narrower X nor skips columns of a wider one.
for i in range(X.shape[1]):
    X[:, i] = le.fit_transform(X[:, i])

In [162]:
# Turn the rating values into integer class labels; this refits `le` on y
y = le.fit(y).transform(y)

In [163]:
# Hold out one fifth of the rows as test set; the seed in kwargs fixes the shuffle.
# TODO: split the data three ways and tune on a validation set, so that RMSE and
# accuracy can be compared between validation and test set!
# NOTE(review): the merged frame repeats the same user/movie rating on several rows
# (one per actor, genre, ...), so a row-wise random split can place near-identical
# rows in both train and test - verify whether a grouped split is needed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    **kwargs,
)


In [164]:
# Define the classifier
# The number of neighbours k is chosen for the current dataset by find_optimal_k()
# May take a while, depending on how many rows were kept above
# NOTE(review): find_optimal_k is defined in a cell further down in this notebook,
# so this cell only works once that cell has been executed.
optimal_k = find_optimal_k()
knn = KNeighborsClassifier(n_neighbors=optimal_k, n_jobs=-1)

Tested Classifier until k=10
Tested Classifier until k=20
Tested Classifier until k=30
Tested Classifier until k=40
Tested Classifier until k=50
Tested Classifier until k=60
Tested Classifier until k=70
Tested Classifier until k=80
Tested Classifier until k=90
Tested Classifier until k=100
Tested Classifier until k=110
Tested Classifier until k=120
Tested Classifier until k=130
Tested Classifier until k=140
Tested Classifier until k=150
Tested Classifier until k=160
Tested Classifier until k=170
Tested Classifier until k=180
Tested Classifier until k=190
Optmial k: 3


In [165]:
# Train the classifier on the training split
print("Training")
# Last expression of the cell, so the fitted estimator is shown as the cell output
knn.fit(X_train, y_train)

Training


KNeighborsClassifier(n_jobs=-1, n_neighbors=3)

In [166]:
# Predict class labels for the held-out test rows
print("Do Prediction")
pred = knn.predict(X_test)

Do Prediction


In [167]:
# Check accuracy
print("Accuracy : {}".format(accuracy_score(y_test, pred)))
# RMSE between predicted and true labels. y was label-encoded above, so this is
# measured in encoded class indices, not on the original rating scale.
# np.sqrt(MSE) replaces `squared=False`, which was deprecated in scikit-learn 1.4
# and removed in 1.6; the value is the same.
print("RMSE : {}".format(np.sqrt(mean_squared_error(y_test, pred))))

Accuracy : 0.60805
RMSE : 1.7691240770505612


In [158]:
# Improve the algorithm: look for a good k
# The data set is limited to X rows while searching for the optimal k
# !Only execute this cell for the full dataset with time and computational power!

# Let's save time for now
limited_movies = merged_movies.head(100000)

# NOTE(review): find_optimal_k in this cell reads X_train/X_test/y_train/y_test,
# not the X and y assigned here - confirm these two assignments are still needed.
X = np.array(limited_movies.iloc[:, :5])

y = np.array(limited_movies.loc[:, 'rating'])

def find_optimal_k(k_values=None):
    """Find the neighbour count k with the highest accuracy for a KNN classifier.

    For every candidate k a KNeighborsClassifier is fitted on the notebook-level
    ``X_train`` / ``y_train`` and scored on ``X_test`` / ``y_test``.

    Note: k is selected on the same test split that is later used for the final
    evaluation, so the score reported for the chosen k is optimistic.

    Parameters
    ----------
    k_values : iterable of int, optional
        Candidate values for k. Defaults to ``range(1, 200)``.

    Returns
    -------
    int
        The first candidate that reaches the highest accuracy.

    Raises
    ------
    ValueError
        If no candidate could be evaluated (no candidates given, or every
        candidate is larger than the number of training rows).
    """
    if k_values is None:
        k_values = range(1, 200)

    # KNN cannot query more neighbours than there are fitted samples.
    n_train = len(X_train)

    k_acc_scores = []
    optimal_k = None
    # Start below any possible accuracy so the first evaluated k is always
    # recorded; starting at 0 left optimal_k unbound when every score was 0.
    accuracy = -1.0

    for k in k_values:
        if k % 10 == 0:
            print("Tested Classifier until k="+str(k))

        if k > n_train:
            continue

        candidate = KNeighborsClassifier(n_neighbors=k, n_jobs=-1)
        candidate.fit(X_train, y_train)
        pred = candidate.predict(X_test)

        # Score once per k and reuse the values below.
        k_accuracy = accuracy_score(y_test, pred)
        # np.sqrt(MSE) instead of `squared=False`, which newer scikit-learn
        # releases no longer accept.
        k_rmse = np.sqrt(mean_squared_error(y_test, pred))
        k_acc_scores.append("k({}) = Accuracy: {} RMSE: {} ".format(k, k_accuracy, k_rmse))

        # Strict '>' keeps the smallest k among ties.
        if k_accuracy > accuracy:
            optimal_k = k
            accuracy = k_accuracy

    if optimal_k is None:
        raise ValueError("find_optimal_k: no candidate k could be evaluated")

    #For full info
    #print(k_acc_scores)

    print("Optmial k: "+str(optimal_k)+" with accuracy: "+str(accuracy))
    return optimal_k

In [144]:
# Remove every row containing a NaN - check later whether it makes a large difference!
omdb = omdb.dropna(axis=0, how='any')

# Dimensionality Reduction

In [168]:
# Fit a principal component analysis on the training set
pca = PCA(**kwargs).fit(X_train)
# Share of the variance explained by each component, rounded to 8 decimals
np.round(pca.explained_variance_ratio_, 8)

array([9.9938421e-01, 3.9600000e-04, 2.0314000e-04, 1.6650000e-05,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00])

In [None]:
# Project both splits with the fitted PCA and keep only the first two components
X_train_trans, X_test_trans = (
    pca.transform(split)[:, :2] for split in (X_train, X_test)
)

In [None]:
# KNN on the two PCA components
knn = KNeighborsClassifier(n_neighbors=2, n_jobs=-1)
print("Training")
knn.fit(X_train_trans, y_train)
print("Do Prediction")
pred = knn.predict(X_test_trans)
# Check accuracy
print("Accuracy : {}".format(accuracy_score(y_test, pred)))
# RMSE between predicted and true labels. y was label-encoded earlier, so this is
# measured in encoded class indices, not on the original rating scale.
# np.sqrt(MSE) replaces `squared=False`, which was deprecated in scikit-learn 1.4
# and removed in 1.6; the value is the same.
print("RMSE : {}".format(np.sqrt(mean_squared_error(y_test, pred))))

In [None]:
# Idea for an approach:
# 1. Separate labeled and unlabeled data
# 2. Use SVM and SSL to train and improve

In [169]:
# Idea for an approach:
# 1. Separate labeled and unlabeled data
# 2. Use SVM and SSL to train and improve

In [170]:
# KNN on the two PCA components
knn = KNeighborsClassifier(n_neighbors=2, n_jobs=-1)
print("Training")
knn.fit(X_train_trans, y_train)
print("Do Prediction")
pred = knn.predict(X_test_trans)
# Check accuracy
print("Accuracy : {}".format(accuracy_score(y_test, pred)))
# RMSE between predicted and true labels. y was label-encoded earlier, so this is
# measured in encoded class indices, not on the original rating scale.
# np.sqrt(MSE) replaces `squared=False`, which was deprecated in scikit-learn 1.4
# and removed in 1.6; the value is the same.
print("RMSE : {}".format(np.sqrt(mean_squared_error(y_test, pred))))

Training
Do Prediction
Accuracy : 0.9942
RMSE : 0.20506096654409878
