<h2>Import all relevant packages</h2>

In [1]:
import pandas as pd
import numpy as np
import math
from datetime import datetime

<h2>Movie dataset</h2>
Loading, merging and cleaning the raw movie, tag and rating tables

In [None]:
# Read every raw table in one pass; the target names are unpacked in the
# same order as the file paths listed below.
(movies, actors, countries, directors, genres,
 locations, movie_tags, ratings, tags) = [
    pd.read_csv(csv_path, sep=',')
    for csv_path in (
        '../data/raw/movies.csv',
        '../data/raw/actors.csv',
        '../data/raw/countries.csv',
        '../data/raw/directors.csv',
        '../data/raw/genres.csv',
        '../data/raw/locations.csv',
        '../data/raw/movie_tags.csv',
        '../data/raw/ratings.csv',
        '../data/raw/tags.csv',
    )
]

In [None]:
# Align the key name of the tag lookup table with movie_tags, then
# outer-join the two tables on that key.
tags = tags.rename(columns={'id': 'tagID'})
tags_movies_merged = movie_tags.merge(tags, how='outer', on='tagID')

In [None]:
# Display the outer-joined tag table for a visual check.
tags_movies_merged

In [None]:
# Keep only the rows of the tag join that have no missing value in any column.
tags_new = tags_movies_merged.dropna(axis=0, how='any')

In [None]:
# Restrict the movie table to its identifier, title and release year.
movies = movies.loc[:, ['id', 'title', 'year']]

In [None]:
# Rename the key column so it matches the 'movieID' key used by the merges below.
movies = movies.rename({'id': 'movieID'}, axis='columns')

In [None]:
# Outer-join the movies with their cleaned tags on the shared movie key.
merged_movies = movies.merge(tags_new, how='outer', on='movieID')

In [None]:
# Display the movie/tag join result.
merged_movies

In [None]:
# Display the raw ratings table before it is merged in.
ratings

In [None]:
# Outer-join the ratings onto the movie/tag frame on the movie key.
merged_movies = merged_movies.merge(ratings, how='outer', on='movieID')

In [None]:
# Order the rows by user. Reassigning instead of sorting with inplace=True
# keeps the cell free of hidden in-place mutation.
merged_movies = merged_movies.sort_values(by=['user_id'])

In [None]:
# Display the merged frame after sorting by user.
merged_movies

In [None]:
# Cast the 'year' column to a 16-bit integer NumPy array.
# NOTE(review): the preceding merges use how='outer', so 'year' may contain
# NaN here, and NaN has no int16 representation -- confirm the column is
# complete before relying on this cast.
year = np.array(merged_movies['year'], np.int16)

In [None]:
# Display the years converted to the default int type; the result is not
# assigned, so `year` itself is left unchanged.
year.astype(int)

In [None]:
# Write the converted years back into the frame, replacing the original column.
merged_movies['year'] = year

In [None]:
# Display the merged frame with the converted 'year' column.
merged_movies

<h2>OmDB</h2>

Editing and cleaning the OMDb dataset

In [2]:
# Load omdb_total.csv and strip the columns that are not used further on.
omdb = (
    pd.read_csv('../data/raw/omdb_total.csv', sep=',')
    # unwanted bookkeeping columns
    .drop(columns=['Unnamed: 0', 'Response'])
    # sparse or unimportant columns
    .drop(columns=[
        'Production', 'Website', 'totalSeasons', 'Season', 'Episode',
        'seriesID', 'Type', 'BoxOffice', 'DVD', 'Poster',
    ])
    # duplicate ratings
    .drop(columns=['Internet Movie Database', 'Metacritic'])
)

In [3]:
# Transform the Rotten Tomatoes score into a float by removing every
# non-digit character.
# regex=True must be explicit: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, so r'\D' would remove nothing and
# the float conversion would fail.
# Values that contain no digits at all become NaN instead of raising.
omdb['Rotten Tomatoes'] = pd.to_numeric(
    omdb['Rotten Tomatoes'].str.replace(r'\D', '', regex=True),
    errors='coerce',
).astype(float)

In [4]:
# Extract the relevant trophies and nominations (Oscars, Golden Globes,
# overall wins, overall nominations) from the free-text 'Awards' column.
def awards(x):
    """Add six numeric award columns to ``x`` in place.

    Columns added, in this order: 'Oscars_won', 'Oscars_nominated',
    'Globes_won', 'Globes_nominated', 'Award_wins', 'Award_nominations'.
    All are floats; rows whose 'Awards' value is not a string get 0.0
    everywhere.

    The number in front of "Oscar" / "Golden Globe" counts as *won* when the
    text starts with "Won", otherwise as a nomination only. A won award is
    also counted as nominated.

    Counts are taken with a regular expression, so multi-digit numbers
    (e.g. "Won 11 Oscars", "121 wins") are read completely rather than being
    cut to a fixed character width.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame with an 'Awards' column. It is modified in place.

    Returns
    -------
    None
    """
    # Non-string entries (e.g. NaN) become '' so that no pattern matches them.
    text = x['Awards'].map(lambda v: v if isinstance(v, str) else '').astype(str)
    starts_with_won = text.str.startswith('Won')

    def _count(pattern):
        # Number captured by ``pattern`` per row, 0.0 where it does not match.
        return text.str.extract(pattern, expand=False).astype(float).fillna(0.0)

    oscars = _count(r'(\d+)\s+Oscar')
    globes = _count(r'(\d+)\s+Golden Globe')

    x['Oscars_won'] = oscars.where(starts_with_won, 0.0)
    x['Oscars_nominated'] = oscars
    x['Globes_won'] = globes.where(starts_with_won, 0.0)
    x['Globes_nominated'] = globes
    x['Award_wins'] = _count(r'(\d+)\s+wins?\b')
    x['Award_nominations'] = _count(r'(\d+)\s+nominations?\b')

In [5]:
# Derive the award columns in place, then discard the raw text column.
awards(omdb)
omdb = omdb.drop(columns=['Awards'])

In [6]:
# One-hot encode the comma-separated 'Genre' column: find all categories,
# add one column per unique value, and assign 1.0 if the movie is in the
# genre, else 0.0.
def genre(x):
    """Add one 0.0/1.0 indicator column per genre to ``x`` in place.

    Genre names are collected from the data itself, so every genre that
    occurs gets a correctly filled column; there is no fixed list of names.
    The new columns are appended in alphabetical order. Rows whose 'Genre'
    value is not a string get 0.0 in every indicator column. Empty names
    (e.g. from a trailing comma) are ignored. If no row holds a string, no
    column is added.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame with a 'Genre' column. It is modified in place.

    Returns
    -------
    None
    """
    def _split(value):
        # Set of stripped, non-empty genre names in one cell.
        if not isinstance(value, str):
            return set()
        return {part.strip() for part in value.split(',')} - {''}

    per_row = x['Genre'].map(_split)
    all_genres = sorted(set().union(*per_row))

    for name in all_genres:
        x[name] = per_row.map(lambda found, name=name: 1.0 if name in found else 0.0)

In [7]:
# Add the genre indicator columns in place, then discard the raw text column.
genre(omdb)
omdb = omdb.drop(columns=['Genre'])

In [8]:
# Clean typos and convert the parental-guideline ratings into a scale from
# 0 - 4 according to suitability for children. Labels without a clear place
# on that scale are mapped to NaN.
# np.nan is used rather than np.NaN: the upper-case alias was removed in
# NumPy 2.0.
rated_dic = {
    'R': 3, 'PG-13': 2, 'PG': 1, 'Not Rated': np.nan, 'nan': np.nan,
    'Approved': np.nan, 'G': 0, 'Passed': np.nan, 'Unrated': np.nan,
    '14': 2, 'GP': 1, 'NC-17': 4, 'NOT RATED': np.nan, 'APPROVED': np.nan,
    'MA': 4, 'UNRATED': np.nan, 'PASSED': np.nan, 'M': np.nan,
    'M/PG': np.nan, 'X': np.nan, 'Y7': 0,
}
# The "TV-" prefix is removed first so that TV ratings share the keys above.
# NOTE(review): labels missing from rated_dic are left as strings by
# .replace -- confirm every label in the data is covered.
omdb['Rated'] = omdb['Rated'].astype(str).apply(lambda x: x.replace("TV-","")).replace(rated_dic)

In [9]:
# Fill missing scores with the (rounded-up) column average, treating an
# unrated movie as mediocre (not too bad to downvote, not too good to
# upvote). Missing vote counts become 0.
# Each result is assigned back to the column: fillna(..., inplace=True) on a
# column selection is chained assignment and does not update `omdb` when
# pandas Copy-on-Write is active.
omdb['Metascore'] = omdb['Metascore'].fillna(math.ceil(omdb['Metascore'].mean()))
omdb['imdbRating'] = omdb['imdbRating'].fillna(math.ceil(omdb['imdbRating'].mean()))
omdb['Rotten Tomatoes'] = omdb['Rotten Tomatoes'].fillna(math.ceil(omdb['Rotten Tomatoes'].mean()))
omdb['imdbVotes'] = omdb['imdbVotes'].fillna(0)

In [10]:
# Convert release dates into seasons so they can be compared:
# winter (Dec, Jan, Feb) = 1, spring (Mar, Apr, May) = 2,
# summer (Jun, Jul, Aug) = 3, fall (Sep, Oct, Nov) = 4.
def dates(x):
    """Parse 'Released' and add season/month/day columns to ``x`` in place.

    'Released' is converted to datetime and three float columns are added:
    'Released_season' (1-4, see the mapping above), 'Released_month' and
    'Released_day'. Rows without a parseable date get NaN in all three.

    The function works on its argument only; it does not touch any global
    frame.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame with a 'Released' column. It is modified in place.

    Returns
    -------
    None
    """
    released = pd.to_datetime(x['Released'])
    x['Released'] = released

    month = released.dt.month.astype(float)
    # month % 12 moves December to 0, so Dec/Jan/Feb fall into the same bucket.
    x['Released_season'] = (month % 12 + 3) // 3
    x['Released_month'] = month
    x['Released_day'] = released.dt.day.astype(float)

In [11]:
# Add the release season/month/day columns in place, then discard the raw date column.
dates(omdb)
omdb = omdb.drop(columns=['Released'])

In [12]:
# Turn the runtime text into a float by cutting off its last four characters.
omdb['Runtime'] = omdb['Runtime'].str.slice(stop=-4).astype(float)

In [29]:
# Transform the language attribute into indicator columns for the five
# most frequent languages.
def language(x):
    """Add indicator columns for the five most common languages to ``x`` in place.

    The comma-separated 'Language' column is split into individual names.
    The five names that occur in the most rows (ties resolved
    alphabetically) each get a column holding 1.0 where the row lists that
    language and 0.0 where it does not. Rows whose 'Language' value is not a
    string get NaN in these columns.

    Names are compared as whole entries, so a row listing only
    'French Sign Language' is not counted as 'French'.

    The function works on its argument only; it does not touch any global
    frame. If no row holds a string, no column is added.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame with a 'Language' column. It is modified in place.

    Returns
    -------
    None
    """
    from collections import Counter

    def _split(value):
        # List of stripped, non-empty language names; None for non-strings.
        if not isinstance(value, str):
            return None
        return [part.strip() for part in value.split(',') if part.strip()]

    per_row = [_split(value) for value in x['Language']]

    # Count each language once per row.
    counts = Counter()
    for names in per_row:
        if names is not None:
            counts.update(set(names))

    # Alphabetical pre-sort + stable sort by count keeps ties in alphabetical order.
    top = sorted(sorted(counts), key=counts.get, reverse=True)[:5]

    for name in top:
        x[name] = [
            np.nan if names is None else float(name in names)
            for names in per_row
        ]
            

In [30]:
# Apply the language transformation to the OMDb frame.
language(omdb)

174
['Aboriginal', 'Acholi', 'Afrikaans', 'Albanian', 'Algonquin', 'American Sign Language', 'Amharic', 'Ancient (to 1453)', 'Apache languages', 'Arabic', 'Aramaic', 'Arapaho', 'Armenian', 'Assamese', 'Assyrian Neo-Aramaic', 'Athapascan languages', 'Awadhi', 'Azerbaijani', 'Bambara', 'Belarusian', 'Bengali', 'Berber languages', 'Bosnian', 'Brazilian Sign Language', 'British Sign Language', 'Bulgarian', 'Burmese', 'Cantonese', 'Catalan', 'Central American Indian languages', 'Chechen', 'Cheyenne', 'Chinese', 'Cornish', 'Corsican', 'Cree', 'Croatian', 'Crow', 'Czech', 'Danish', 'Dari', 'Dutch', 'Dzongkha', 'Egyptian (Ancient)', 'English', 'Esperanto', 'Estonian', 'Ewe', 'Filipino', 'Finnish', 'Flemish', 'French', 'French Sign Language', 'Fur', 'Gallegan', 'Georgian', 'German', 'German Sign Language', 'Greek', 'Guarani', 'Hakka', 'Hawaiian', 'Hebrew', 'Hindi', 'Hmong', 'Hokkien', 'Hopi', 'Hungarian', 'Icelandic', 'Indonesian', 'Inuktitut', 'Irish', 'Italian', 'Japanese', 'Japanese Sign Lan

<h2>Joining the data</h2>
After joining the relevant data you can test different approaches in order to predict the ratings

In [None]:
# Outer-join the cleaned OMDb frame with the movie/tag/rating frame on title and year.
full_data = omdb.merge(merged_movies, how='outer', on=['title', 'year'])

In [None]:
# Display the joined data set.
full_data

In [None]:
# Treat infinite values as missing, then drop every row that has a missing value.
full_data = full_data.replace([np.inf, -np.inf], np.nan)
full_data = full_data.dropna()

In [None]:
# Display the dtype of every column in the joined data set.
full_data.dtypes


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA


In [None]:
# Feature matrix: the first 11 columns of full_data as a NumPy array.
# NOTE(review): 0:11 is positional, so the selected features depend on the
# column order produced by the merge -- confirm these are the intended columns.
X = np.array(full_data.iloc[:,0:11])

# Target vector: the 'rating' column.
y = np.array(full_data['rating'])

In [None]:
# Label-encode each of the 11 feature columns in place; the same encoder
# object is re-fitted for every column, so it only remembers the last fit.
# NOTE(review): numeric columns are label-encoded as well, which replaces
# their values by rank codes -- confirm this is intended.
le = preprocessing.LabelEncoder()
for i in range(0,11):
    X[:,i] = le.fit_transform(X[:,i])

In [None]:
# Encode the rating values as integer class labels; `le` is re-fitted on y here.
y = le.fit_transform(y)

In [None]:
# Hold out one fifth of the rows for testing. The fixed random_state makes the
# split, and the scores computed from it, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/5, random_state=42)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic-regression classifier with the library's default settings.
logreg = LogisticRegression()

In [None]:
# Fit the classifier on the training split.
logreg.fit(X_train, y_train)

In [None]:
# Predict the encoded rating class for every row of the test split.
y_pred = logreg.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score
# Share of test rows whose predicted class equals the true class.
accuracy_score(y_test,y_pred)

In [None]:
from sklearn import tree

In [None]:
# Decision-tree baseline on the same split. The fixed random_state makes the
# fitted tree, and therefore the reported accuracy, reproducible.
clf = tree.DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test,y_pred)