In [26]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv

from sklearn import datasets, ensemble
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

# RandomizedCV Search
from scipy.stats import uniform as sp_randFloat
from scipy.stats import randint as sp_randInt
from sklearn.model_selection import RandomizedSearchCV

# Models
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

pd.set_option('display.max_columns', 100)

### Tuning the Gradient Boosting Algorithm

###### Loading the training data

In [27]:
def prepareData(data, train=True, test_size=0.2, random_state=None):
    """Strip the non-feature columns and, for training data, split into train/test sets.

    The input frame is NOT modified: a new frame is built, so the cell that
    calls this function can be re-run on the same DataFrame and the caller
    keeps access to the dropped columns (e.g. ``id``).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``text``, ``hashtags``, ``user_mentions``,
        ``urls`` and ``id``; when ``train`` is true it must also contain
        ``retweet_count``.
    train : bool, default True
        If true, separate the ``retweet_count`` target and return a
        train/test split. If false, return only the cleaned feature frame.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int or None, default None
        Forwarded to ``train_test_split``; pass an int for a repeatable split.

    Returns
    -------
    list or pandas.DataFrame
        ``[X_train, X_test, y_train, y_test]`` when ``train`` is true (the
        targets are NumPy arrays), otherwise the cleaned DataFrame.

    Raises
    ------
    KeyError
        If any of the columns to drop is missing from ``data``.
    """
    # data cleaning: free-text / identifier columns are not used as features
    dropped_columns = ['text', 'hashtags', 'user_mentions', 'urls', 'id']
    features = data.drop(columns=dropped_columns)
    if not train:
        return features

    X = features.drop('retweet_count', axis=1)
    y = features['retweet_count'].to_numpy()
    return train_test_split(X, y, test_size=test_size, random_state=random_state)

In [29]:
# Fix the randomness of this cell so a Restart & Run All trains on the same rows.
SEED = 42
N_SAMPLES = 150  # small subsample keeps the hyper-parameter search fast

# prepareData() calls train_test_split without a random_state, which then
# draws from NumPy's global RNG -- seeding it here makes the split repeatable.
np.random.seed(SEED)

df = pd.read_csv('../../data/train_clean_final.csv').sample(N_SAMPLES, random_state=SEED)
df_eval = pd.read_csv('../../data/eval_clean_final.csv')

X_train, X_test, y_train, y_test = prepareData(df)

#Evaluation dataset
X_test_eval = prepareData(df_eval, False)

# #StratifiedShuffleSplit(n_splits = 1)
# #X_train, X_test, y_train, y_test = scsplit(df, df['retweet_count'],stratify=df['retweet_count'], train_size=0.7, test_size=0.3)

###### Normalising the data

In [30]:
from sklearn.preprocessing import MinMaxScaler

# The scaling statistics are learnt from the training split only and then
# re-applied unchanged to the train, test and evaluation feature sets.
scaler = MinMaxScaler().fit(X_train)

X_train_norm, X_test_norm, X_eval_norm = (
    scaler.transform(split) for split in (X_train, X_test, X_test_eval)
)

###### Setting the parameter grid for our Gradient Boosting algorithm

In [31]:
# Base estimator handed to the randomized search.
# NOTE(review): loss='lad' was renamed 'absolute_error' in scikit-learn 1.0 and
# removed in 1.2 -- confirm the installed version before upgrading.
model = GradientBoostingRegressor(loss='lad')

# Distributions the search samples from. n_estimators is not searched, so the
# estimator keeps its default for it.
parameters = dict(
    learning_rate=sp_randFloat(),   # scipy uniform with default loc=0, scale=1
    subsample=sp_randFloat(),       # same uniform distribution
    max_depth=sp_randInt(4, 10),    # integers 4..9 (upper bound is exclusive)
)

In [32]:
# Randomized hyper-parameter search: 10 sampled candidates, 2-fold CV, scored by
# (negated) mean absolute error. random_state pins the sampled candidates so the
# search evaluates the same parameter settings on every run.
randm = RandomizedSearchCV(estimator=model,
                           param_distributions=parameters,
                           scoring='neg_mean_absolute_error',
                           cv=2,
                           n_iter=10,
                           n_jobs=-1,
                           random_state=42,
                           verbose=1)

randm.fit(X_train_norm, y_train)

Fitting 2 folds for each of 10 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    6.2s finished


RandomizedSearchCV(cv=2, estimator=GradientBoostingRegressor(loss='lad'),
                   n_jobs=-1,
                   param_distributions={'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001EBF47E9FA0>,
                                        'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001EBF47E99A0>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001EBF47E91C0>,
                                        'subsample': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001EBF47E9820>},
                   scoring='neg_mean_absolute_error', verbose=1)

In [34]:
#Store results in text file
# Write a plain-text summary of the random search (best estimator, score, params)
with open('results/Optimization', 'w') as report_file:
    report_parts = [
        "\n========================================================",
        " Results from Random Search ",
        "========================================================",
        "\n The best estimator across ALL searched params:\n",
        str(randm.best_estimator_),
        "\n The best score across ALL searched params:\n",
        str(randm.best_score_),
        "\n The best parameters across ALL searched params:\n",
        str(randm.best_params_),
        "\n ========================================================",
    ]
    report_file.write("".join(report_parts))

## Training GradientBoostingRegressor

In [None]:
# `{...}` is a set containing Ellipsis, not a dict, so it cannot be unpacked with
# ** nor indexed by 'n_estimators'. Use the full parameter set of the best
# estimator found by the random search instead (it includes 'n_estimators').
params = randm.best_estimator_.get_params()

In [None]:
# Build the regressor from the chosen hyper-parameters and train it on the
# normalised training split.
reg = ensemble.GradientBoostingRegressor(**params)
reg.fit(X=X_train_norm, y=y_train)

In [None]:
# In-sample error: predict on the data the model was fitted on
y_pred = reg.predict(X_train_norm)
train_mae = mean_absolute_error(y_train, y_pred)
print("The mean absolute error (MAE) on train set: {:.4f}".format(train_mae))

In [None]:
# Held-out error on the normalised test split
y_pred = reg.predict(X_test_norm)
test_mae = mean_absolute_error(y_test, y_pred)
print("The mean absolute error (MAE) on test set: {:.4f}".format(test_mae))

# How often each rounded prediction occurs
rounded_predictions = pd.Series(np.rint(y_pred))
rounded_predictions.value_counts()

In [None]:
n_stages = params['n_estimators']
stage_numbers = np.arange(1, n_stages + 1)

# Loss on the test split after every boosting stage.
# NOTE(review): `reg.loss_` is deprecated in newer scikit-learn releases --
# confirm it exists in the installed version.
test_score = np.zeros(n_stages, dtype=np.float64)
for stage, staged_prediction in enumerate(reg.staged_predict(X_test_norm)):
    test_score[stage] = reg.loss_(y_test, staged_prediction)

# Training vs. test loss per boosting iteration
fig, ax = plt.subplots(figsize=(6, 6))
ax.set_title('Deviance')
ax.plot(stage_numbers, reg.train_score_, 'b-', label='Training Set Deviance')
ax.plot(stage_numbers, test_score, 'r-', label='Test Set Deviance')
ax.legend(loc='upper right')
ax.set_xlabel('Boosting Iterations')
ax.set_ylabel('Deviance')
fig.tight_layout()
plt.show()

In [None]:
# Predict the number of retweets for the evaluation dataset
# Predict the number of retweets for the evaluation dataset
y_pred = np.rint(reg.predict(X_eval_norm))

# The tweet ids are re-read from the file the evaluation features came from:
# prepareData() removes the 'id' column, and no `eval_data` frame exists yet at
# this point of the notebook. Row order matches X_eval_norm because the
# evaluation frame is never shuffled.
eval_ids = pd.read_csv('../../data/eval_clean_final.csv', usecols=['id'])['id']
assert len(eval_ids) == len(y_pred), "ids and predictions must line up row by row"

# Dump the results into a file that follows the required Kaggle template.
# newline='' is required by the csv module; without it Windows output gets a
# blank line after every row.
with open("class-regression_predictions.csv", 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["TweetID", "NoRetweets"])
    writer.writerows(
        (str(tweet_id), str(int(prediction)))
        for tweet_id, prediction in zip(eval_ids, y_pred)
    )

In [None]:
# Frequency of each value currently held in y_pred (the rounded evaluation-set
# predictions when the cells are run in order).
pd.Series(y_pred).value_counts()

In [None]:
# Re-predict on the normalised test split and show how often each rounded
# prediction occurs. Note that this overwrites y_pred.
y_pred = reg.predict(X_test_norm)
pd.Series(np.rint(y_pred)).value_counts()

In [None]:
# Preview the first rows of the normalised test features.
pd.DataFrame(X_test_norm).head()

### Gradient Boosting Regressor without classification

In [None]:
# Hand-picked feature subset, used here without normalisation
feature_columns = [
    'user_verified', 'user_statuses_count', 'user_followers_count',
    'user_friends_count', 'nbr_user_mentions', 'nbr_hashtags', 'nbr_urls',
    'hour', 'date', 'text_len',
]
X = df[feature_columns]
y = df['retweet_count']

# Fixed-seed 85/15 split. This replaces the earlier X_train/X_test/y_train/y_test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=13
)

# Refit `reg` on the raw features; the earlier model is replaced
reg = ensemble.GradientBoostingRegressor(**params)
reg.fit(X=X_train, y=y_train)

In [None]:
# Held-out error of the model trained on the raw feature subset
y_pred = reg.predict(X_test)
test_mae = mean_absolute_error(y_test, y_pred)
print("The mean absolute error (MAE) on test set: {:.4f}".format(test_mae))

In [None]:
# Load the evaluation set and discard the leftover index column without
# mutating a frame in place.
eval_data = (
    pd.read_csv("../../data/eval_clean_final_pred_3-classes.csv")
    .drop('Unnamed: 0', axis=1)
)
X_val = eval_data[['user_verified', 'user_statuses_count', 'user_followers_count', 'user_friends_count', 'nbr_user_mentions', 'nbr_hashtags', 'nbr_urls', 'hour', 'date', 'text_len']]

# Predict the number of retweets for the evaluation dataset
y_pred = reg.predict(X_val)
print(y_pred)
y_pred = np.rint(y_pred)

# Dump the results into a file that follows the required Kaggle template.
# newline='' is required by the csv module; without it Windows output gets a
# blank line after every row.
with open("class-regression_predictions.csv", 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["TweetID", "NoRetweets"])
    writer.writerows(
        (str(tweet_id), str(int(prediction)))
        for tweet_id, prediction in zip(eval_data['id'], y_pred)
    )