In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv

# Data Preparation
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

#Models
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor

#Metrics
from sklearn.metrics import mean_absolute_error

## Loading and preparing dataset

In [3]:
def prepareData(data, train=True):
    # data cleaning
    data.drop(columns=['text', 'hashtags', 'user_mentions', 'hashtags', 'urls', 'id'], inplace=True)
    if train:
        X = data.drop('retweet_count', axis=1)
        y = data['retweet_count'].to_numpy()
        return train_test_split(X, y, test_size=0.2)
    else:
        return data
    
df = pd.read_csv('../../data/train_clean_final.csv')
df_eval = pd.read_csv('../../data/eval_clean_final.csv')

X_train, X_test, y_train, y_test = prepareData(df)
X_test_eval = prepareData(df_eval, False)

In [4]:
scaler = MinMaxScaler()
X_train_norm = scaler.fit_transform(X_train)
X_test_norm = scaler.transform(X_test)
X_eval_norm = scaler.transform(X_test_eval)

In [5]:
results = []

## Testing different regressors without tunning

In [None]:
# Global metric for all ours algorithms
def metrics(Xtrain, Xtest, ytrain, ytest, model, model_name):
    #Train loss
    y_pred_train = np.rint(model.predict(Xtrain))
    mse_train = mean_absolute_error(ytrain, y_pred_train)
    #Test loss
    y_pred_test = np.rint(model.predict(Xtest))
    mse_test = mean_absolute_error(ytest, y_pred_test)

    output = "\n========================"
    output += f" Results for {model_name} "
    output += "========================" 
    output += "\n MAE on TRAIN set: {:.4f}".format(mse_train) 
    output += "\n MAE on TEST set: {:.4f}".format(mse_test)
    output += "\n ================================================ \n "
    return output

#### Gradient Boosting Regressor

In [None]:
gbr = GradientBoostingRegressor(loss='lad')
gbr.fit(X_train_norm, y_train)
result_gbr = metrics(X_train_norm, X_test_norm, y_train, y_test, gbr, "Gradient Boosting Regressor")
results.append(result_gbr)

#### Random Forest Regressor

In [7]:
rfr = RandomForestRegressor(verbose=2)
rfr.fit(X_train_norm, y_train)
result_rfr = metrics(X_train_norm, X_test_norm, y_train, y_test, rfr, "Random Forest Regressor")
results.append(result_rfr)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


building tree 1 of 100


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.3s remaining:    0.0s


building tree 2 of 100


KeyboardInterrupt: 

#### Store the results in text file

In [None]:
#Store results in text file
with open('results/regression-comparison', 'w') as f:
    for res in result_gbr:
        f.write(res)