### kNN Preprocessing

In [1]:
import pandas as pd
import numpy as np
import string
import math
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.decomposition import PCA
kwargs = dict(random_state=42)

In [2]:
movies = pd.read_csv('../../data/preprocessed/movies_id_updated.csv')
average_ratings = pd.read_csv('../../data/preprocessed/average_ratings.csv')
omdb = pd.read_csv('../../data/preprocessed/omdb_cleaned.csv')
ratings = pd.read_csv('../../data/preprocessed/ratings_clean_std_0.csv')

In [6]:
omdb.columns

Index(['Title', 'Year', 'Rated', 'Runtime', 'Writer', 'Plot', 'Language',
       'imdbRating', 'imdbVotes', 'imdbID', 'Rotten Tomatoes', 'Metacritic',
       'Series', 'Released_season', 'Released_month', 'Released_day',
       'PG_Rating', 'Available_languages', 'Oscars_won', 'Oscars_nominated',
       'Golden_globe_won', 'Golden_globe_nominated'],
      dtype='object')

In [7]:
# Select relevant columns
omdb = omdb[['imdbID', 'Year', 'Runtime', 'Language', 'imdbRating', 'imdbVotes', 'Rotten Tomatoes', 'Metacritic',
       'Series', 'PG_Rating', 'Oscars_won', 'Oscars_nominated',
       'Golden_globe_won', 'Golden_globe_nominated']]

In [8]:
omdb.isna().sum()

imdbID                       0
Year                         0
Runtime                      9
Language                     0
imdbRating                   4
imdbVotes                    5
Rotten Tomatoes           1237
Metacritic                4019
Series                       0
PG_Rating                 2314
Oscars_won                   0
Oscars_nominated             0
Golden_globe_won             0
Golden_globe_nominated       0
dtype: int64

In [9]:
# Add movies which could not be retrieved by omdb
imdb_ids = pd.DataFrame(movies['imdbID'].unique()).rename(columns={0:'imdbID'})
omdb = imdb_ids.merge(omdb, how='left', on='imdbID')

In [14]:
# Dropping the 10 almost empty movies
indices = omdb[omdb['imdbRating'].isna()]['imdbID'].index
for i in indices:
    omdb = omdb.drop([i],axis=0,)

In [15]:
omdb.isna().sum()

imdbID                       0
Year                         0
Runtime                      5
Language                     0
imdbRating                   0
imdbVotes                    1
Rotten Tomatoes           1236
Metacritic                4015
Series                       0
PG_Rating                 2310
Oscars_won                   0
Oscars_nominated             0
Golden_globe_won             0
Golden_globe_nominated       0
dtype: int64

In [16]:
# Replace for Series, PG and awards NaN with 0 and handle accordingly 
for i in range (9,14):
    omdb.iloc[:,i] = omdb.iloc[:,i].fillna(0)

In [17]:
omdb.isna().sum()

imdbID                       0
Year                         0
Runtime                      5
Language                     0
imdbRating                   0
imdbVotes                    1
Rotten Tomatoes           1236
Metacritic                4015
Series                       0
PG_Rating                    0
Oscars_won                   0
Oscars_nominated             0
Golden_globe_won             0
Golden_globe_nominated       0
dtype: int64

In [18]:
# Comment Max: Vorschlag - Stattdessen Median/Modalwert benutzen benutzen
# Fill NaN Series
omdb.loc[:,'Series'] = omdb.loc[:,'Series'].fillna(0)
# Fill NaN Runtime
omdb.loc[:,'Runtime'] = omdb.loc[:,'Runtime'].fillna(omdb.loc[:,'Runtime'].median())
# Fill NaN for imdbVotes
omdb.loc[:,'imdbVotes'] = omdb.loc[:,'imdbVotes'].fillna(omdb.loc[:,'imdbVotes'].median())
#for i in range (4,16):
    #omdb.iloc[:,i] = omdb.iloc[:,i].fillna(omdb.iloc[:,i].median())

In [19]:
omdb['Rotten Tomatoes'].describe()

count    8177.000000
mean        6.223138
std         2.812948
min         0.000000
25%         4.000000
50%         6.800000
75%         8.600000
max        10.000000
Name: Rotten Tomatoes, dtype: float64

In [20]:
omdb['Metacritic'].describe()

count    5398.000000
mean        5.779918
std         1.842826
min         0.100000
25%         4.500000
50%         5.900000
75%         7.100000
max        10.000000
Name: Metacritic, dtype: float64

In [21]:
omdb['Rotten Tomatoes'].isna()

0       False
1       False
2       False
3       False
4       False
        ...  
9418    False
9419    False
9420    False
9421    False
9422     True
Name: Rotten Tomatoes, Length: 9413, dtype: bool

In [22]:
omdb = omdb.rename(columns={"Rotten Tomatoes": "RottenTomatoes"})

In [23]:
omdb.loc[9422, 'RottenTomatoes']

nan

In [24]:
omdb.loc[9421, 'RottenTomatoes']

5.5

In [25]:
omdb['RottenTomatoes'].describe()

count    8177.000000
mean        6.223138
std         2.812948
min         0.000000
25%         4.000000
50%         6.800000
75%         8.600000
max        10.000000
Name: RottenTomatoes, dtype: float64

In [26]:
omdb.loc[0, 'RottenTomatoes']

10.0

In [27]:
# Replace RT Score with Metacritic if NaN
# where Replace values where the condition is False.
omdb['RottenTomatoes'] = omdb['RottenTomatoes'].where(~omdb['RottenTomatoes'].isna(),omdb['Metacritic'])

In [32]:
# Replace Metacritic with RT Scroe if NaN
omdb['Metacritic'].where(~omdb['Metacritic'].isna(),omdb['RottenTomatoes'])

0       9.5
1       3.9
2       5.3
3       5.6
4       4.9
       ... 
9418    6.8
9419    3.3
9420    8.0
9421    4.7
9422    NaN
Name: Metacritic, Length: 9413, dtype: float64

In [35]:
# Fill remaining with mean()
omdb['RottenTomatoes']=omdb['RottenTomatoes'].where(~omdb['RottenTomatoes'].isna(),omdb['RottenTomatoes'].mean())
omdb['Metacritic']=omdb['Metacritic'].where(~omdb['Metacritic'].isna(),omdb['Metacritic'].mean())

In [36]:
omdb['Metacritic'].isna().sum()

0

In [37]:
omdb['RottenTomatoes'].isna().sum()

0

In [38]:
merged_data = ratings.merge(omdb, how='left', on='imdbID')

In [39]:
merged_data.head()

Unnamed: 0.1,Unnamed: 0,user_id,imdbID,rating,Year,Runtime,Language,imdbRating,imdbVotes,RottenTomatoes,Metacritic,Series,PG_Rating,Oscars_won,Oscars_nominated,Golden_globe_won,Golden_globe_nominated
0,0,1264,tt0047034,3.5,1954.0,96.0,['Japanese'],7.6,27485.0,9.3,7.8,0.0,0.0,0.0,0.0,0.0,0.0
1,1,213,tt0304141,2.5,2004.0,142.0,['English'],7.9,524511.0,9.0,8.2,0.0,1.0,0.0,2.0,0.0,0.0
2,2,593,tt0369436,3.0,2008.0,88.0,['English'],5.7,61335.0,2.4,4.1,0.0,2.0,0.0,0.0,0.0,0.0
3,3,609,tt1077258,4.0,2007.0,105.0,['English'],7.1,196149.0,7.4,5.779918,0.0,0.0,0.0,0.0,0.0,0.0
4,4,1590,tt0052182,4.0,1958.0,100.0,['English'],7.4,6337.0,6.9,5.779918,0.0,0.0,2.0,0.0,0.0,0.0


In [40]:
merged_data = merged_data.drop(columns={'Unnamed: 0','Language'})

In [41]:
# Comment Max: No NaN rows anymore - except for language for the missing movies
merged_data.isna().sum()

user_id                     0
imdbID                      0
rating                      0
Year                      269
Runtime                   269
imdbRating                269
imdbVotes                 269
RottenTomatoes            269
Metacritic                269
Series                    269
PG_Rating                 269
Oscars_won                269
Oscars_nominated          269
Golden_globe_won          269
Golden_globe_nominated    269
dtype: int64

In [42]:
# convert imdbID from string to float
merged_data['imdbID'] = merged_data['imdbID'].str.replace(r'tt','')
merged_data['imdbID'] = merged_data['imdbID'].astype(float)
merged_data['imdbID'].unique()

array([ 47034., 304141., 369436., ...,  81433., 295480., 298072.])

In [45]:
#Statt    
    #Drop NaN rows - check later if it makes large difference!
#merged_data = merged_data.dropna()

# Jetzt einfach ma5 mean() eingefüllt
for i in range (3,15):
    merged_data.iloc[:,i] = merged_data.iloc[:,i].fillna(merged_data.iloc[:,i].median())