In [51]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import precision_recall_fscore_support
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [57]:
### Data loading preparation

def prepareDataClassification(data, train=True):
    # data cleaning
    data.drop(columns=['text', 'hashtags', 'user_mentions', 'hashtags', 'urls', 'id'], inplace=True)
    if train:
        data['bins'] = pd.cut(data['retweet_count'], bins=[-1,1,2,3,4,10,100,1000,10000,50000,100000,200000,500000,1000000], labels=[0,1,2,3,4,10,100,1000,10000,50000,100000,200000,500000])
        X = data.drop(['classif','retweet_count'], axis=1)
        y = data['classif']
        X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=data['bins'], test_size=0.3)
        X_train = X_train.drop(columns=['bins'])
        X_test = X_test.drop(columns=['bins'])
        return X_train, X_test, y_train, y_test
    else:
        return data

df = pd.read_csv('../../data/train_clean_final.csv')
df_eval = pd.read_csv('../../data/eval_clean_final.csv')

#Adding classification column
df['classif'] = pd.cut(df['retweet_count'], bins=[-1,4,1000000], labels=[0,1])

X_train, X_test, y_train, y_test = prepareDataClassification(df, True)
X_test_eval = prepareDataClassification(df_eval, False)

scaler = MinMaxScaler()
X_train_norm = scaler.fit_transform(X_train)
X_test_norm = scaler.transform(X_test)
X_eval_norm = scaler.transform(X_test_eval)

In [59]:
X_train.head()

Unnamed: 0,timestamp,user_verified,user_statuses_count,user_followers_count,user_friends_count,text_len,hour,nbr_user_mentions,nbr_hashtags,nbr_urls
170495,1588333060,0,4423,463,748,35,11,0,0,0
457485,1588332258,1,5534,12837,2567,148,11,0,0,2
488832,1588581149,0,8708,7025,3560,99,8,0,0,2
39304,1588296358,0,9250,197,1227,17,1,0,0,0
125203,1588458779,0,28148,123,350,146,22,1,0,2


### Random Forest Classifier

In [53]:
params = {'random_state':9, 'n_estimators':100}

In [54]:
rfc = RandomForestClassifier(**params, verbose=2)
rfc.fit(X_train_norm, y_train)

#Confusion matrix
y_pred=rfc.predict(X_test_norm)
matrix = confusion_matrix(y_test, y_pred)
matrix = matrix.astype('float') / matrix.sum(axis=1)[:, np.newaxis]
plt.figure(figsize=(10,5))
sns.set(font_scale=1.4)
sns.heatmap(matrix, annot=True, annot_kws={'size':10},
            cmap=plt.cm.Greens, linewidths=0.2)
plt.savefig('results/Confusion_matri_RF_default')

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


building tree 1 of 100


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.8s remaining:    0.0s


building tree 2 of 100
building tree 3 of 100


KeyboardInterrupt: 

In [None]:
print('Accuracy:', accuracy_score(y_test, y_pred))
print('F1-score:', f1_score(y_test, y_pred, average=None))

### Gradient Boosting Classifier

In [None]:
gbc = GradientBoostingClassifier(random_state=0, verbose=2)
gbc.fit(X_train_norm, y_train) 

In [None]:
#Confusion matrix
y_pred=gbc.predict(X_test_norm)
matrix = confusion_matrix(y_test, y_pred)
matrix = matrix.astype('float') / matrix.sum(axis=1)[:, np.newaxis]
plt.figure(figsize=(10,5))
sns.set(font_scale=1.4)
sns.heatmap(matrix, annot=True, annot_kws={'size':10},
            cmap=plt.cm.Greens, linewidths=0.2)
plt.savefig('results/Confusion_matri_GBC')

In [None]:
print('Accuracy:', accuracy_score(y_test, y_pred))
print('F1-score:', f1_score(y_test, y_pred, average=None))

### K-nearest neighbors classifier

In [None]:
neigh = KNeighborsClassifier(n_neighbors=2, weights='distance')
neigh.fit(X_train_norm, y_train)  #too long

In [None]:
#Confusion matrix
y_pred = neigh.predict(X_test_norm)
matrix = confusion_matrix(y_test, y_pred)
matrix = matrix.astype('float') / matrix.sum(axis=1)[:, np.newaxis]
plt.figure(figsize=(10,5))
sns.set(font_scale=1.4)
sns.heatmap(matrix, annot=True, annot_kws={'size':10},
            cmap=plt.cm.Greens, linewidths=0.2)
plt.savefig('results/Confusion_matri_KNN')

In [None]:
print('Accuracy:', accuracy_score(y_test, y_pred))
print('F1-score:', f1_score(y_test, y_pred, average=None))

### Logistic regression (classifier)

In [None]:
lgc = LogisticRegression(random_state=9, verbose=1)
lgc.fit(X_train_norm, y_train)

#Confusion matrix
y_pred = lgc.predict(X_test_norm)
matrix = confusion_matrix(y_test, y_pred)
matrix = matrix.astype('float') / matrix.sum(axis=1)[:, np.newaxis]
plt.figure(figsize=(10,5))
sns.set(font_scale=1.4)
sns.heatmap(matrix, annot=True, annot_kws={'size':10},
            cmap=plt.cm.Greens, linewidths=0.2)
plt.savefig('results/Confusion_matri_LGC')