In [1]:
import pandas as pd
import numpy as np
from pydataset import data

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier


from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

from fomlads.data.external import import_for_classification

from fomlads.model.classification import project_data
from fomlads.model.classification import maximum_separation_projection

from fomlads.plot.exploratory import plot_scatter_array_classes
from fomlads.plot.exploratory import plot_class_histograms
from fomlads.model.classification import fisher_linear_discriminant_projection


pd.set_option("display.max_columns", None)

In [2]:



# I wrote the get_fisher_histogram function to avoid repeated code
# fisher_linear_discriminant_projection, project_data, and plot_class_histograms are all from fomlads
def get_fisher_histogram(inputs, targets):
    """
    Returns the projected histogram using Fishers model
    """
    
    w = fisher_linear_discriminant_projection(inputs, targets)
    projected_inputs = project_data(inputs, w)
    ax = plot_class_histograms(projected_inputs, targets)

   
    
def get_fishers_predictions(inputs, weights): 
    
    """
    Predicts the targets based on inputs and weights.
    The weights are caluculated using Fishers criterion.
    """
        
    m = np.mean(inputs, axis = 0)
    predictions = []
    
    for x in inputs:
        y = np.matmul(weights.T, x-m) # y = w.T * (x - m)
        if y > 0:
            predictions.append(1)
        else:
            predictions.append(0)

    return predictions
        
def get_accuracy(predictions, targets):
    """
    Calculates what proportion of the guesses are correct.
    For example, if 80% of the values match, returns 0.8.
    """
    
    
    scores = []
    for i in range(len(targets)):
        if predictions[i] == targets[i]:
            scores.append(1)
        else:
            scores.append(0)
        
    accuracy = sum(scores)/len(targets)
    return accuracy

def get_train_test_split(df, train_size = 0.7):
    
    """
    Splits the dataframe into a training set and a testing set.
    train_size determiens how much of the data goes into the trainig set (0.7 means 70%).
    Returns the training set and testing set.
    """
    
    # Code for sampling adapted from: https://stackoverflow.com/questions/24147278/how-do-i-create-test-and-train-samples-from-one-dataframe-with-pandas
    
    train = df.sample(frac = train_size) # Use random_state to fix a seed value
    test = df.drop(train.index)
    
#     x_train = train.drop(y_label, axis = 1)
#     y_train = train[y_label]
    
#     x_test = test.drop(y_label, axis = 1)
#     y_test = test[y_label]
    
    return train, test

# Code adapted from: https://stackoverflow.com/questions/2130016/splitting-a-list-into-n-parts-of-approximately-equal-length
def split(a, n):
    """
    Splits a list into n parts of approximately equal length.
    """
    if n == 0:
        raise Exception(" Cannot split a list into 0 parts, enter a valid input for n")
    elif n == 1:
        return a
    else:    
        k, m = divmod(len(a), n)
        return list(a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n))


def get_fold_indices(df, num_folds = 10):
    """
    Retruns the indices for each iteration of cross validation.
    For example, if input is has 10 rows and needs 5 folds:
    the first iteration has indices [0, 1] for testing and [2, 3, 4, 5, 6, 7, 8, 9] for training
    the second iteration has indices [2, 3] for testing and [0, 1, 4, 5, 6, 7, 8, 9] for training 
    and so on.
    
    train_indices[0] will give you the row numbers of the training data for the first iteration  
    val_indices[0] will give you the row numbers of the validation data for the first iteration  

    """
    
    
    indices = df.index.values.tolist()
    N = len(indices)   
        
    train_indices = []
    val_indices = []
    
    if num_folds == 0 or num_folds == 1:
        raise Exception(" Cannot cross validate with less than 2 folds")
    else:      
    
        splits = split(indices, num_folds)

        for i in range(num_folds):
            v = splits[i]

            t = splits[:i] + splits[i+1:]        
            t =  [item for sublist in t for item in sublist]



            train_indices.append(t)
            val_indices.append(v)


        return train_indices, val_indices
        
        
# This is just to check that get_fold_indices() works.    
x = np.arange(10)
dfx = pd.DataFrame(x, columns = ["x"])
t,v = get_fold_indices(dfx, 2)

print(t)
print("____________________________________")
print(v)
    


[[5, 6, 7, 8, 9], [0, 1, 2, 3, 4]]
____________________________________
[[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]


In [3]:

# ######################################################################
# # Iris data from tutorial, just to check if fisher model is working.

# ifname = 'iris.data'

# # reimport data, just 2 classes
# classes = ["Iris-setosa", "Iris-versicolor"]
# inputs, targets, field_names, classes = import_for_classification(ifname, classes=classes)
# N = len(inputs)

# randomize = np.arange(len(targets))
# np.random.shuffle(randomize)
# inputs = inputs[randomize]
# targets = targets[randomize]


# x_train = inputs[0:int(N*0.7)]
# y_train = targets[0:int(N*0.7)]

# x_test = inputs[int(N*0.7):]
# y_test = targets[int(N*0.7):]


# # Using fomlads model
# fishers_weights = fisher_linear_discriminant_projection(inputs,targets)
# fishers_predictions = get_fishers_predictions(x_test, fishers_weights)
# accuracy = get_accuracy(fishers_predictions, y_test)
# print(f"Fomlads score = {accuracy * 100}%")


# # Using sklearn model
# lda = LinearDiscriminantAnalysis()
# lda.fit(x_train, y_train)
# lda_score = lda.score(x_test, y_test)


# print(f"sklearn score = {lda_score * 100}%")


In [19]:

####################################################################
# Our dataset

df = pd.read_csv("profit_x_y.csv")
df = df.drop("Unnamed: 0", axis =1 )


df = df.drop("title_x", axis = 1)
df = df.drop("title_y", axis = 1)
df = df.drop("profit_x", axis = 1)
df = df.drop("profit_y", axis = 1)

# Had to remove gross income since it can be used with budget to directly get profit,
# and an unreleased movie will not have a known income.

df = df.drop("worlwide_gross_income_x", axis = 1)
df = df.drop("worlwide_gross_income_y", axis = 1)

# Using a filter such as this one (or action movies only etc.) gives an error 
# about a singular matrix when calculating the weights. I'm not sure what's causing it.

# df = df[df["LANGUAGE_English_x"] == 1]


# df_train will be used for cross validation, and df_test will be used for final testing
df_train, df_test = get_train_test_split(df,train_size = 0.8)

# Re-calculates the indices because pandas normally keeps the original indices when to take a section of a dataframe
df_train.reset_index(inplace=True, drop=True) 
df_test.reset_index(inplace=True, drop=True) 



# Gets the row numbers for the training folds and the validaition folds for each iteration.
# tf_indices -> indices for training folds for each iteration
# vf_indices -> indices for validation folds for each iteration
tf_indices, vf_indices = get_fold_indices(df_train, num_folds = 10)

# Keeps track of the scores for each iteration of cross validation.
cross_validation_scores = []

# Performs cross validation
for i in range(len(tf_indices)):  

    # Training and validation data for the current iteration/
    train_fold_df = df_train.iloc[tf_indices[i]]
    validation_fold_df = df_train.iloc[vf_indices[i]]
    
    # X and y data for training fold
    train_fold_x = train_fold_df.drop("profit_xy", axis = 1).to_numpy()
    train_fold_y = train_fold_df["profit_xy"].to_numpy()
    
    # X and y data for validation fold
    validation_fold_x = validation_fold_df.drop("profit_xy", axis = 1).to_numpy()
    validation_fold_y = validation_fold_df["profit_xy"].to_numpy()
    
    # Gets weights from training fold
    fishers_weights = fisher_linear_discriminant_projection(train_fold_x, train_fold_y)
    
    # Makes predictions using weights calculated from training fold
    fishers_predictions = get_fishers_predictions(validation_fold_x, fishers_weights)

    # Calucaltes accuracy for current iteration and appends to list
    fold_accuracy = get_accuracy(fishers_predictions, validation_fold_y)
    cross_validation_scores.append(fold_accuracy)
    

average_cv_percent = round(100 * sum(cross_validation_scores)/len(cross_validation_scores),2)





# Using sklearn model for comparison
lda = LinearDiscriminantAnalysis()
lda.fit(df_train.drop("profit_xy", axis = 1), df_train["profit_xy"])
lda_score = lda.score(df_test.drop("profit_xy", axis = 1), df_test["profit_xy"])
lda_score = round(lda_score, 2)

print(f"Average cross validation score = {average_cv_percent}%")
print(f"sklearn score = {lda_score * 100}%")
df_t, df_v = get_fold_indices(df,10)



Average cross validation score = 71.77%
sklearn score = 71.0%


In [None]:
# # Using sklearn, compares fishers model (LinearDiscriminantAnalysis), logistic regression and support vector machine (LinearSVC).

# # Creats and fits models to traning data
# lda = LinearDiscriminantAnalysis()
# lda.fit(x_train, y_train)

# log_reg = LogisticRegression()
# log_reg.fit(x_train, y_train)

# svm = LinearSVC()
# svm.fit(x_train, y_train)


# # Makes predictions on using data
# y_pred_lda = lda.decision_function(x_test)
# y_pred_logistic = log_reg.decision_function(x_test)
# y_pred_svm = svm.decision_function(x_test)


# # Gets the ROC curves for each model
# # Using the "score" method is not a good metric since guessing "1" everytime will still give a score of ~50%. 

# # fpr -> Fale Positive Rate
# # tpr -> True Positive Rate
# # Can ignore threhsold, I only included it beacuse the YouTube tutorial had it.
# fpr_lda, tpr_lda, threshold_lda = roc_curve(y_test, y_pred_lda)
# fpr_logistic, tpr_logistic, threshold_logistic = roc_curve(y_test, y_pred_logistic)
# fpr_svm, tpr_svm, threshold_svm = roc_curve(y_test, y_pred_svm)

# # Caclulates the area under the ROC curve for each of the models/
# auc_lda = auc(fpr_lda, tpr_lda)
# auc_logistic = auc(fpr_logistic, tpr_logistic)
# auc_svm = auc(fpr_svm, tpr_svm)


# # Plots the ROC curves.
# plt.figure()
# plt.plot(fpr_lda, tpr_lda, label = f"Fisher ( Area = {round(auc_lda,3)} )")
# plt.plot(fpr_logistic, tpr_logistic, label = f"Logistic Regression( Area = {round(auc_logistic,3)} )")
# plt.plot(fpr_svm, tpr_svm, label = f"Support Vector Machine( Area = {round(auc_svm,3)} )")

# plt.xlabel("False positive rate")
# plt.ylabel("True positive rate")

# plt.legend()
# plt.show()


