In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import Lasso

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor, VotingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
import sklearn.neighbors

from sklearn.svm import SVC

from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score
from sklearn.svm import LinearSVR 
from sklearn import neighbors

from scipy import stats
from sklearn.preprocessing import PolynomialFeatures


In [None]:
def _regression_metrics(y_true, y_pred):
    """Return ``(MAE, MSE, RMSE, R2)`` for one set of predictions."""
    mse = mean_squared_error(y_true, y_pred)
    return (
        mean_absolute_error(y_true, y_pred),
        mse,
        np.sqrt(mse),
        r2_score(y_true, y_pred),
    )


def _fit_and_score(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training split and return its test-split metrics."""
    model.fit(X_train, y_train)
    return _regression_metrics(y_test, model.predict(X_test))


def compare_models(df, target_col):
    """Fit several regressors on one train/test split and tabulate their test metrics.

    The data is split 80/20 with ``random_state=42`` and every model is scored
    on the same test split. Linear regression, LinearSVR, KNN, Lasso and the
    polynomial model are trained on ``RobustScaler``-scaled features (the
    scaler is fitted on the training split only); the tree-based models are
    trained on the unscaled features.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus the target column. All feature columns are handed
        to the estimators as-is (presumably all numeric -- TODO confirm).
    target_col : str
        Name of the column to predict. A single column is expected: an earlier
        attempt to predict two columns at once failed with
        ``ValueError: continuous-multioutput is not supported``.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns ``Model``, ``Mae``, ``Mse``,
        ``RMSE`` and ``R2``.

    Raises
    ------
    KeyError
        If ``target_col`` is not a column of ``df``.
    """
    X = df.drop(target_col, axis=1)
    y = df[target_col]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit the scaler once, on the training split only, and reuse it for every
    # scale-sensitive model so no information leaks from the test split.
    scaler = RobustScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    scaled_split = (X_train_scaled, y_train, X_test_scaled, y_test)
    raw_split = (X_train, y_train, X_test, y_test)

    # Model 1 - Linear Regression
    lin_reg_scores = _fit_and_score(LinearRegression(), *scaled_split)

    # Model 2 - linear Support Vector Regressor.
    # random_state pins the solver's internal shuffling so reruns match.
    svr_scores = _fit_and_score(LinearSVR(epsilon=1.5, random_state=42), *scaled_split)

    # Model 3 - Decision tree, alone and inside bagging / voting ensembles.
    tree_scores = _fit_and_score(
        DecisionTreeRegressor(max_depth=3, random_state=42), *raw_split
    )

    # The base estimator is passed positionally: the keyword was renamed from
    # `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
    # removed in 1.4, whereas the first positional slot works on both.
    bagging_scores = _fit_and_score(
        BaggingRegressor(
            DecisionTreeRegressor(max_depth=3, random_state=42),
            n_estimators=10,
            random_state=42,
        ),
        *raw_split,
    )

    # NOTE(review): the three voters share one configuration and the same
    # training data, so this ensemble can add little over the single tree;
    # consider diversifying the voters.
    voting_scores = _fit_and_score(
        VotingRegressor(
            [
                (label, DecisionTreeRegressor(max_depth=3, random_state=42))
                for label in ('tree1', 'tree2', 'tree3')
            ]
        ),
        *raw_split,
    )

    # Model 4 - Random Forest
    forest_scores = _fit_and_score(
        RandomForestRegressor(n_estimators=500, max_leaf_nodes=16, n_jobs=-1, random_state=42),
        *raw_split,
    )

    # Model 5 - K-nearest neighbours.
    # KNN is distance based, so it is trained on the scaled features (the
    # previous version fitted a scaler here but never used its output).
    knn_scores = _fit_and_score(neighbors.KNeighborsRegressor(n_neighbors=5), *scaled_split)

    # Model 6 - Lasso Regression: the L1 penalty can shrink the coefficients
    # of less important features to zero.
    lasso_scores = _fit_and_score(Lasso(alpha=0.1), *scaled_split)

    # Model 7 - Polynomial regression: degree-2 expansion of the scaled
    # features followed by ordinary least squares. It uses the same split as
    # every other model so the rows are directly comparable.
    # NOTE: the expansion grows quadratically with the number of features.
    poly_scores = _fit_and_score(
        Pipeline(
            [
                ('poly', PolynomialFeatures(degree=2, include_bias=False)),
                ('lr', LinearRegression()),
            ]
        ),
        *scaled_split,
    )

    named_scores = [
        ('Linear Reg', lin_reg_scores),
        ('SVR', svr_scores),
        ('Descion Tree', tree_scores),
        ('DT bag', bagging_scores),
        ('DT vot', voting_scores),
        ('RF', forest_scores),
        ('KNN', knn_scores),
        ('Lasso', lasso_scores),
        ('Polynomial', poly_scores),
    ]

    # Build the table in one go from (name, mae, mse, rmse, r2) records.
    records = [(name, *scores) for name, scores in named_scores]
    return pd.DataFrame(records, columns=['Model', 'Mae', 'Mse', 'RMSE', 'R2'])

# Read the movies CSV and benchmark every model on the "Rating" column.
# NOTE(review): the file name is spelled 'mvoies' -- confirm it matches the file on disk.
df=pd.read_csv('./data/mvoies_processed_noTitle.csv')
target_col = "Rating"

# `models` is the last expression of the cell, so the comparison table is shown as the cell output.
models = compare_models(df, target_col)
models



In [None]:
# Load data: rebinds `df` to a fresh read of the movies CSV for the ablation runs below.
df = pd.read_csv('./data/mvoies_processed_noTitle.csv')

def drop_and_compare_single(df, prefixes, target_col):
    """Re-run the model comparison with one matching column removed at a time.

    For every prefix in ``prefixes`` (in order), each column of ``df`` whose
    name starts with that prefix is dropped on its own, the dropped column is
    announced, and the table returned by ``compare_models`` is printed.
    ``df`` itself is not modified and nothing is returned.
    """
    for prefix in prefixes:
        matching = [name for name in df.columns if name.startswith(prefix)]
        for name in matching:
            reduced = df.drop(columns=name)
            print("Dropped column:", name)
            print(compare_models(reduced, target_col=target_col))

def drop_and_compare_cluster(df, col_prefix, target_col):
    """Re-run the model comparison with a whole group of columns removed.

    Every column of ``df`` whose name starts with ``col_prefix`` is dropped
    together, the prefix is announced, and the table returned by
    ``compare_models`` is printed. ``df`` itself is not modified and nothing
    is returned.
    """
    reduced = df.drop(
        columns=[name for name in df.columns if name.startswith(col_prefix)]
    )
    print("Dropped columns:", "Starts with: " + col_prefix)
    print(compare_models(reduced, target_col=target_col))

target_col = "Rating"

# Dropping each column one by one and running the models
drop_and_compare_single(df, ['Year', 'Month', 'Runtime', 'Budget_inf', 'Income_inf', 'Profit_inf', 'ROI_inf'], target_col)

# Dropping all columns starting with prefix and running the models
# NOTE(review): 'contient' looks like a misspelling of 'continent'; if no column
# starts with it, that run silently compares the full feature set -- confirm
# against the actual column names.
cols = ['cert', 'genre', 'contient', 'top_50_director', 'top_1000_Stars']
for col in cols:
    # The helper prints its own report and returns None, so wrapping the call
    # in print() would only add a stray "None" line after every table.
    drop_and_compare_cluster(df, col, target_col)
