# Total Gross Weights

In [1]:
# Read in data and store feature vectors in array X and labels in array y
import numpy as np
DATAgross = np.loadtxt('imdb_adjusted_total_discrete.csv', delimiter=',', skiprows=1)  # Read data from csv file
X = DATAgross[:, :-1]  # All columns except final column
y = DATAgross[:, -1]   # Final column is label
X.shape

(4286, 9)

In [2]:
# Split data into 80% training and 20% testing. Set random_state=0.

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

In [3]:
# Train linear regression model on training data.
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

scaler = StandardScaler()
scaler.fit(X_train)
Xtrainscaled = scaler.fit_transform(X_train)
Xtestscaled = scaler.transform(X_test)

regr = LinearRegression()  # Create LinearRegression instance
regr.fit(X_train, y_train) # Learn hypothesis, i.e., find w_0 and w_1 for best fitting line
print('Regression score on training data:\t' + str(regr.score(Xtrainscaled, y_train)))
print('Regression score on testing data:\t' + str(regr.score(Xtestscaled, y_test)))

Regression score on training data:	-1224.684905845779
Regression score on testing data:	-878.4941184496165


In [4]:
from sklearn.feature_selection import f_regression
print(f_regression(X_train, y_train)[1])

[1.32900835e-016 2.55477116e-011 7.54254675e-008 3.99657559e-004
 6.26708171e-128 4.88978231e-001 4.19461129e-002 1.01164930e-040
 0.00000000e+000]


Most significant: Director, followed by year, censor, and runtime

# Predictors for Gross

In [5]:
from sklearn import linear_model  # Using sklearn Perceptron and Logistics classifier
from sklearn import ensemble  # Using RandomForest classifier
from sklearn import neighbors  # Using nearest neighbors classifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler


learners = {'Perceptron': linear_model.Perceptron(max_iter=10, random_state =0),
            'RandomForest': ensemble.RandomForestClassifier(),
            'kNN': neighbors.KNeighborsClassifier(metric = "manhattan"), 
            'logistic': linear_model.LogisticRegression(C = 10,random_state=0), #C is for regularization, which prevents overfitting with logistic regression
            'SVM':SVC(C= 1000, gamma = 0.00001), 
            #C and gamma values vary based on the dataset. 
            #C controls the margin. larger Cs allow for less error. 
            #gamma controls how relevant the training data is. larger gamma values mean the training data has more weight, which can lead to overfitting.
            #these values were chosen because they led to higher accuracy. 
            'NeuralNetwork': MLPClassifier()
           }

#for neural networks, kNN, and SVM
scaler = StandardScaler()
scaler.fit(X_train)
Xtrainscaled = scaler.fit_transform(X_train)
Xtestscaled = scaler.transform(X_test)


for classifierName in learners:
    #this has an error, but I think we need to do feature scaling for these?
    if classifierName == 'NeuralNetwork' or classifierName =='kNN' or classifierName == 'SVM':
        #print(Xtrainscaled.shape)
        #print(y_train.shape)
        learners[classifierName].fit(Xtrainscaled, y_train)
        print('Accuracy of ' + classifierName + ':\t' + str(np.mean(cross_val_score(learners[classifierName], Xtrainscaled, y_train))))
    else:
        learners[classifierName].fit(X_train, y_train)
        print('Accuracy of ' + classifierName + ':\t' + str(np.mean(cross_val_score(learners[classifierName], X_train, y_train))))
        




Accuracy of Perceptron:	0.9959166648932773
Accuracy of RandomForest:	1.0
Accuracy of kNN:	0.9970832712647102


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP:

Accuracy of logistic:	0.9994169096209913
Accuracy of SVM:	0.9967917260752059
Accuracy of NeuralNetwork:	0.9967917260752059


# Rating Weights

In [6]:
# Read in data and store feature vectors in array X and labels in array y
import numpy as np

DATAratings = np.loadtxt('imdb_rating_disrete.csv', delimiter=',', skiprows=1)  # Read data from csv file
X = DATAratings[:, :-1]  # All columns except final column
y = DATAratings[:, -1]   # Final column is label


In [7]:
# Split data into 80% training and 20% testing. Set random_state=0.

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

In [8]:
# Train linear regression model on training data.

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

scaler.fit(X_train)
Xtrainscaled = scaler.transform(X_train)
Xtestscaled = scaler.transform(X_test)

regr = LinearRegression()  # Create LinearRegression instance
regr.fit(Xtrainscaled, y_train) # Learn hypothesis, i.e., find w_0 and w_1 for best fitting line
print('Regression score on test data:\t' + str(regr.score(Xtestscaled, y_test)))
print('Regression score on train data:\t' + str(regr.score(Xtrainscaled, y_train)))

Regression score on test data:	0.6246102430275409
Regression score on train data:	0.6244903495344294


In [9]:
from sklearn.feature_selection import f_regression
print(f_regression(X_train, y_train)[1])

[1.20725647e-24 4.46222511e-07 8.08470550e-76 7.40286550e-01
 1.38238706e-01 3.16477985e-01 1.22318085e-06 8.58221903e-01
 0.00000000e+00]


In [10]:
from sklearn import linear_model  # Using sklearn Perceptron and Logistics classifier
from sklearn import ensemble  # Using RandomForest classifier
from sklearn import neighbors  # Using nearest neighbors classifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score

learners = {'Perceptron': linear_model.Perceptron(max_iter=10, random_state = 0),
            'RandomForest': ensemble.RandomForestClassifier(),
            'kNN': neighbors.KNeighborsClassifier(metric = "manhattan"), 
            'logistic': linear_model.LogisticRegression(C = 10, random_state=0),
            'SVM':SVC(C= 1000, gamma = 0.00001),
            'NeuralNetwork': MLPClassifier()
           }


scaler = StandardScaler()

scaler.fit(X_train)
Xtrainscaled = scaler.transform(X_train)
Xtestscaled = scaler.transform(X_test)


for classifierName in learners:
    if classifierName == 'NeuralNetwork' or classifierName =='kNN' or classifierName == 'SVM':
        learners[classifierName].fit(Xtrainscaled, y_train)
        print('Accuracy of ' + classifierName + ':\t' + str(np.mean(cross_val_score(learners[classifierName], Xtrainscaled, y_train))))
    else:
        learners[classifierName].fit(X_train, y_train)
        print('Accuracy of ' + classifierName + ':\t' + str(np.mean(cross_val_score(learners[classifierName], X_train, y_train))))
        
        



Accuracy of Perceptron:	0.6094005235045008
Accuracy of RandomForest:	1.0
Accuracy of kNN:	0.9329028962992914


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP:

Accuracy of logistic:	0.8389376689153242
Accuracy of SVM:	0.9889136217573578




Accuracy of NeuralNetwork:	0.9997084548104956




In [None]:

#does cross-val-score actually predict on testing data? We need to run the models on test data right? 