In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import math
# import ML related packages of sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [2]:
d=pd.read_csv('cleaned.csv')
x = d[['adult', 'belongs_to_collection', 'budget', 'genres',
       'original_language', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'runtime',
       'spoken_languages', 'vote_average', 'vote_count']]
y = d['revenue']

In [3]:
def lin_reg(x,y):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
    x_train=scaler.fit_transform(x_train)
    x_test=scaler.transform(x_test)
    rg = LinearRegression()
    modl = rg.fit(x_train,y_train)
    tpred = rg.predict(x_train)
    tr_squared = r2_score(y_train, tpred)
    pred = modl.predict(x_test)
    r_squared = r2_score(y_test, pred)
    print('\nLinear R Squared(train & test) : ', tr_squared, r_squared)

In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
def rg_reg(x,y,a):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
    x_train=scaler.fit_transform(x_train)
    x_test=scaler.transform(x_test)
    rg = Ridge()
    parameter = {'alpha' : a}
    rg = GridSearchCV(rg, parameter, scoring = 'neg_mean_squared_error', cv=5)  
    modl = rg.fit(x_train,y_train)
    tr_squared = rg.best_score_
    a = rg.best_params_
    pred = modl.predict(x_test)
    r_squared = r2_score(y_test, pred)
    print('\nRidge R Squared(train & test) : ', tr_squared, r_squared, a)

In [5]:
from sklearn.linear_model import Lasso
def ls_reg(x,y,a):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
    x_train=scaler.fit_transform(x_train)
    x_test=scaler.transform(x_test)
    rg = Lasso(alpha = a)
    modl = rg.fit(x_train,y_train)
    tpred = modl.predict(x_train)
    tr_squared = r2_score(y_train, tpred)
    pred = modl.predict(x_test)
    r_squared = r2_score(y_test, pred)
    print('\nLasso R Squared(train & test) : ', tr_squared, r_squared)

In [6]:
from sklearn.neighbors import KNeighborsRegressor
def knn(x,y,a):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
    x_train=scaler.fit_transform(x_train)
    x_test=scaler.transform(x_test)
    rg = KNeighborsRegressor()
    parameter = {'n_neighbors' : a}
    rg = GridSearchCV(rg, parameter, scoring = 'neg_mean_squared_error', cv=10)
    modl = rg.fit(x_train,y_train)
    tpred = modl.predict(x_train)
    tr_squared = r2_score(y_train, tpred)
    a = rg.best_params_
    pred = modl.predict(x_test)
    r_squared = r2_score(y_test, pred)
    print('\nKNN R Squared(train & test) : ', tr_squared, r_squared, a )

In [7]:
from xgboost import XGBRegressor as xgbr
from sklearn.metrics import mean_squared_error as MSE
def xb(x,y):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
    x_train=scaler.fit_transform(x_train)
    x_test=scaler.transform(x_test)
    xg = xgbr(objective ='reg:linear', n_estimators = 233, seed = 42, verbosity = 0)
    xg.fit(x_train, y_train)
    tpred = xg.predict(x_test)
    rmse = np.sqrt(MSE(y_test, tpred))
    pred = xg.predict(x_test)
    r_squared = r2_score(y_test, pred)
    print('\nXGB R Squared(train & test) : ', rmse, r_squared)

In [14]:
lin_reg(x,y)
rg_reg(x,y,[0.01,0.1,1,10,100,1000])
ls_reg(x,y,0.1)
knn(x,y,[1,2,3,4,5,6,7,8])
xb(x,y)


Linear R Squared(train & test) :  0.741040077511971 0.7161161705123373

Ridge R Squared(train & test) :  -5375628393360183.0 0.7140127098281148 {'alpha': 100}

Lasso R Squared(train & test) :  0.741040077511971 0.7161161703669944

KNN R Squared(train & test) :  0.8030796255856893 0.7295638009179963 {'n_neighbors': 7}

XGB R Squared(train & test) :  72824537.65029554 0.7968153174967331


In [3]:
# split the data as train and test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [4]:
lr = LinearRegression()
modl = lr.fit(x_train,y_train)
pred = modl.predict(x_test)
score = modl.score(x_test, y_test)
print(score)

0.7161161704592646


In [65]:
cross_val_score(lr,x,y)

array([0.53194644, 0.73013687, 0.77078387, 0.76539571, 0.75046103])