<h1> Assignment - Data Science Working Student </h1>

***Zoey Hearn***

In [2]:
import pandas as pd
import numpy as np
import random
from math import isnan

Load and preprocess data.

In [4]:
df_ratings = pd.read_csv('title.ratings.tsv', sep='\t')
df_basics = pd.read_csv('title.basics.tsv', sep='\t')
df_akas = pd.read_csv('title.akas.tsv', sep='\t')
df_crew = pd.read_csv('title.crew.tsv', sep='\t')

In [5]:
'''The akas data set shows all the different translations of each movie.
I recorded this information by counting how many translations each
move has'''

df_translations = df_akas.groupby(['titleId']).size().reset_index(name='counts')
df_translations.columns = ['tconst', 'numTranslations']

In [6]:
'''Create dataframe which holds feature information to be used'''

df = df_ratings.join(df_basics.set_index('tconst'),on='tconst')
df = df.join(df_translations.set_index('tconst'),on='tconst')
df = df.join(df_crew.set_index('tconst'),on='tconst')

#drop all types except movies
df = df[df['titleType'].str.contains("movie")]

#drop features that holds irrelevant or sparse information
df = df.drop(['isAdult','endYear','writers','titleType'], axis = 1)

#drop rows where information is unavailable
df = df[pd.notnull(df['runtimeMinutes'])]

#Add column which counts how many movies a director directed
directorPopularity = df['directors'].value_counts().to_dict()
df['directorPopularity'] = df['directors'].map(directorPopularity)

#further preprocess data
df = df.fillna(1)
df = df[df.runtimeMinutes.str.contains("\N") == False]
df = df[df.directors.str.contains("\N") == False]

df = df.reset_index()
df = df.drop(['index'],axis = 1)

Split dataframe into train and test data. Use 80% of data to train and 20% to test.

In [7]:
train_df=df.sample(frac=0.80, random_state=200).sort_index()
test_df=df.drop(train_df.index).sort_index()

Implement a decision tree, which takes into account both continuous and categorical data. For simplicity, divide continuous features into two categories by splitting the data exactly in half, with one category corresponding to the lower half and the other corresponding to the upper half. The categorical data will be split upon whether or not a movie is of a certain genre.

In [8]:
def split_genre(genre, df):
    """For categorical data:
    This function splits a dataframe (node) into
    two child nodes depending on a specified genre"""
    left = df[df['genres'].str.contains(genre)]
    right = df.drop(left.index)
    return left, right

def split_feature(feature, df):
    """For continuous data:
    This function splits the data according to a specified method
    which finds the value to split on"""
    value = split_feature_value(feature, df)
    left = df[df[feature] <= value]
    right = df[df[feature] > value]
    return left, right

def split_feature_value(feature, df):
    """This function decides where to split the continuous feature"""
    df_sorted = df.sort_values(by=[feature])
    value = df_sorted.iloc[int(.5*len(df))][feature]
    return value

def split_feature_value2(feature, df):
    """same as above function"""
    value = (max(df[feature]) - min(df[feature]))/2
    return value

Minimize the squared error from the mean to find the best split. I define information gain as the difference between the squared error from the mean before and after the split.

In [9]:
def get_information_gain_genre(genre, df):
    """
    This function compares the squared error from the mean
    after the split to the squared error from the mean from 
    the mean of the parent node using genre as the splitting criterion.
    """
    original_loss = sum((df['averageRating'].mean()-df['averageRating'])**2)/len(df)
    left, right = split_genre(genre, df)
    left_loss = sum((left['averageRating'].mean()-left['averageRating'])**2)
    right_loss = sum((right['averageRating'].mean()-right['averageRating'])**2)
    information_gain = original_loss - (left_loss + right_loss)/len(df)
    return information_gain

def get_information_gain_feature(feature, df):
    """
    Same as above function except it uses feature, instead of genre,
    as the splitting criterion.
    """
    original_loss = sum((df['averageRating'].mean()-df['averageRating'])**2)/len(df)
    left, right = split_feature(feature, df)
    left_loss = sum((left['averageRating'].mean()-left['averageRating'])**2)
    right_loss = sum((right['averageRating'].mean()-right['averageRating'])**2)
    information_gain = original_loss - (left_loss + right_loss)/len(df)
    return information_gain

To decide which is the best split, iterate over possible splits, and maximize the information gain.

In [10]:
features = ['startYear',
             'runtimeMinutes',
             'numTranslations',
             'directorPopularity',
             'numVotes']

genres = ['Action', 'Adventure', 'Animation', 
          'Biography', 'Comedy', 'Crime', 
          'Documentary', 'Drama', 'Fantasy', 
          'History', 'Horror', 'News', 
          'Sport', 'Thriller', 'War']

def get_split(df):
    """This function decides whether to split using categorical
    or continous data, and which genre/feature to split on,
    using a greedy approach. It iterates through all possible
    splits and stores the information corresponding to the split
    in a dictionary. It then finds the maximum information gain
    and returns a dictionary which stores:
        information gain, 
        the split type (whether it uses a categorical or continous split),
        the feature or genre that it splits on, 
        the two child nodes, 
        and the value corresponding to the regression split."""
    #create empty dictionary
    _dict = {}
    for i in genres:
        _dict.update({get_information_gain_genre(i,df):['genre', i]})
    for i in features:
        _dict.update({get_information_gain_feature(i,df):['regression', i]})
    _dict = {k: _dict[k] for k in _dict if not isnan(k)}
    if _dict[max(_dict.keys())][0] == 'genre':
        left, right = split_genre(_dict[max(_dict.keys())][1], df)
        #genres.remove(_dict[max(_dict.keys())][1])
        value = 0
    if _dict[max(_dict.keys())][0] == 'regression':
        left, right = split_feature(_dict[max(_dict.keys())][1], df)
        value = split_feature_value(_dict[max(_dict.keys())][1], df)
    
    groups = left, right
    return {'gain': max(_dict.keys()), #information gain
            'type_split': _dict[max(_dict.keys())][0], #categorical/regression
            'feature': _dict[max(_dict.keys())][1], #genre/feature
            'groups': groups,
            'value': value} #left/right
    
def get_leaf(split):
    """
    This function finds the average rating corresponding
    to the latest split
    """
    return split['averageRating'].mean()

def split(node, max_depth, min_size, depth):
    """
    This function recursively splits the data, using the 
    get_split function, until all nodes are a leaf node
    which return the average rating
    """
    left, right = node['groups']
    del(node['groups'])
    if len(left) == 0 or len(right) ==0:
        node['left'] = node['right'] = get_leaf(left + right)
        return 
    if depth >= max_depth:
        node['left'], node['right'] = get_leaf(left), get_leaf(right)
        return 
    if len(left) <= min_size:
        node['left'] = get_leaf(left)
    else:
        node['left'] = get_split(left)
        split(node['left'], max_depth, min_size, depth+1)
    # process right child
    if len(right) <= min_size:
        node['right'] = get_leaf(right)
    else:
        node['right'] = get_split(right)
        split(node['right'], max_depth, min_size, depth+1)

Build decision tree according to splitting criteria.

In [11]:
def build_tree(train, max_depth, min_size):
    """
    Creates a decision tree which will be used to predict 
    movie rating.
    function directly taken from: 
    https://machinelearningmastery.com/implement-decision-tree-algorithm-scratch-python/
    Returns a nested dictionary
    """
    root = get_split(train)
    split(root, max_depth, min_size, 1)
    return root

def navigate(tree, test):
    """
    This function takes in the decision tree and a movie,
    and moves through the tree based on the features of the movie
    to return a prediction
    """
    if tree['type_split'] == 'genre':
        if tree['feature'] in test['genres']:
            if isinstance(tree['left'],float):
                return tree['left']
            else: 
                direction = 'left'
        else:
            if isinstance(tree['right'],float):
                return tree['right']
            else: 
                direction = 'right'
    else:
        if test[tree['feature']] < tree['value']:
            if isinstance(tree['left'],float):
                return tree['left']
            else: 
                direction = 'left'
        else:
            if isinstance(tree['right'],float):
                return tree['right']
            else: 
                direction = 'right'
    return navigate(tree[direction],test)

def get_total_loss(tree, test_df):
    model_prediction = 0
    blind_prediction = 0
    for i in range(len(test_df)):
        test = test_df.iloc[i]
        prediction = navigate(tree, test)
        model_prediction += abs(prediction - test['averageRating'])
        blind_prediction += abs(train_df['averageRating'].mean() - test['averageRating'])
    return model_prediction/len(test_df), blind_prediction/len(test_df)

Implement.

In [12]:
max_depth = 10
min_leaf = 100
tree = build_tree(train_df, max_depth, min_leaf)
model_loss, blind_loss = get_total_loss(tree, test_df)
print('Average distance from actual rating using mean:', blind_loss)
print('Average distance from actual rating using model:', model_loss)

('Average distance from actual rating using mean:', 1.0245971312209252)
('Average distance from actual rating using model:', 0.8771174047138115)


Now I'm going to see if I can improve these results by implementing a (small) forrest. Train 10 trees on randomly chosen subsets of the data. Predict rating based on average prediction from each tree.

In [13]:
'''Create forrest.'''

#hyper parameters
num_trees = 15
frac_data = .01
max_depth = 5
min_leaf = 50

forrest = []
for i in range(num_trees):
    train = train_df.sample(frac = 0.01)
    tree = build_tree(train, 10, 50)
    forrest.append(tree)

In [14]:
def get_prediction(forrest, test):
    '''Function that takes a sample and runs it through all the 
    decision trees in the forrest and outputs the mean prediction'''
    predictions = []
    for i in forrest:
        prediction = navigate(i, test)
        predictions.append(prediction)
    return np.mean(predictions)

In [15]:
def get_total_loss(forrest, test_df):
    """Finds the average distance from the correct rating
    for a test dataframe, based on the predictions from the forrest"""
    model_prediction = 0
    for i in range(len(test_df)):
        test = test_df.iloc[i]
        prediction = get_prediction(forrest, test)
        model_prediction += abs(prediction - test['averageRating'])
    return model_prediction/len(test_df)

In [16]:
forrest_loss = get_total_loss(forrest, test_df)

In [17]:
print('Average distance from actual rating using forrest:', forrest_loss)

('Average distance from actual rating using forrest:', 0.8919985090910499)
