In [24]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold

### Random Forest Model
All datasets below will be passed through a Stratified K-Fold and a Random Forest Model to test for accuracy.
The best performing training dataset will be used as the main dataset for tuning the Random Forest Model.

In [25]:
# Training splits, one CSV per embedding variant.
train_glove_0v = pd.read_csv('../numerical_datasets/train_data_mod_glove_50d_0v_numerical.csv')
train_glove_custom = pd.read_csv('../numerical_datasets/train_data_mod_glove_50d_custom_numerical.csv')
train_fasttext = pd.read_csv('../numerical_datasets/train_data_mod_fasttext_300d_numerical.csv')
train_word2vec = pd.read_csv('../numerical_datasets/train_data_mod_word2vec_50d_numerical.csv')

# Matching test splits.
test_glove_0v = pd.read_csv('../numerical_datasets/test_data_mod_glove_50d_0v_numerical.csv')
test_glove_custom = pd.read_csv('../numerical_datasets/test_data_mod_glove_50d_custom_numerical.csv')
test_fasttext = pd.read_csv('../numerical_datasets/test_data_mod_fasttext_300d_numerical.csv')
test_word2vec = pd.read_csv('../numerical_datasets/test_data_mod_word2vec_50d_numerical.csv')

In [26]:
# Display the column labels of the glove_0v training frame.
train_glove_0v.columns

Index(['id', 'target', 'keyword_encoded', 'tweet_length', 'punctuation_count',
       'embedding_0', 'embedding_1', 'embedding_2', 'embedding_3',
       'embedding_4', 'embedding_5', 'embedding_6', 'embedding_7',
       'embedding_8', 'embedding_9', 'embedding_10', 'embedding_11',
       'embedding_12', 'embedding_13', 'embedding_14', 'embedding_15',
       'embedding_16', 'embedding_17', 'embedding_18', 'embedding_19',
       'embedding_20', 'embedding_21', 'embedding_22', 'embedding_23',
       'embedding_24', 'embedding_25', 'embedding_26', 'embedding_27',
       'embedding_28', 'embedding_29', 'embedding_30', 'embedding_31',
       'embedding_32', 'embedding_33', 'embedding_34', 'embedding_35',
       'embedding_36', 'embedding_37', 'embedding_38', 'embedding_39',
       'embedding_40', 'embedding_41', 'embedding_42', 'embedding_43',
       'embedding_44', 'embedding_45', 'embedding_46', 'embedding_47',
       'embedding_48', 'embedding_49'],
      dtype='object')

In [27]:
# Model input columns: three hand-crafted features followed by the 50
# embedding dimensions (embedding_0 ... embedding_49), in that order.
features = ['keyword_encoded', 'tweet_length', 'punctuation_count']
features += [f'embedding_{dim}' for dim in range(50)]

In [39]:
# Cross-validation setup shared by the evaluation cells.
num_folds = 5  # number of stratified folds

# Shuffled stratified splitter with a fixed seed.
skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)

# Baseline Random Forest: library defaults plus a fixed seed.
rfc = RandomForestClassifier(random_state=42)



Here we define the function that will be used to train and evaluate the model's accuracy based on the Stratified K-Fold.

In [40]:
def train_and_evaluate(rfc, X, y, cv=None):
    """Cross-validate a classifier and print its mean accuracy and F1 score.

    A fresh, unfitted clone of ``rfc`` is trained on every fold, so the
    estimator passed in is never refit or otherwise modified. This matters
    when ``rfc`` is already fitted (for example the ``best_estimator_`` of a
    hyperparameter search): fitting it in place would leave it trained on
    only the last fold's training split.

    Parameters
    ----------
    rfc : estimator
        Classifier exposing ``fit`` and ``predict`` that can be copied with
        ``sklearn.base.clone``.
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Binary target aligned with ``X`` (F1 uses ``average='binary'``).
    cv : cross-validator, optional
        Object with a ``split(X, y)`` method. When omitted, the
        notebook-level ``skf`` splitter is used.

    Returns
    -------
    None
        The mean and standard deviation of both metrics are printed.
    """
    # Fall back to the notebook-level splitter so existing calls are unchanged.
    splitter = skf if cv is None else cv

    accuracy_scores = []
    f1_scores = []

    for train_index, val_index in splitter.split(X, y):
        # Positional split into this fold's training and validation rows
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        # Same hyperparameters, fresh state: the caller's estimator is untouched
        fold_model = clone(rfc)
        fold_model.fit(X_train, y_train)
        y_pred = fold_model.predict(X_val)

        accuracy_scores.append(accuracy_score(y_val, y_pred))
        # 'binary' scores the positive class only; use 'weighted' for multi-class problems
        f1_scores.append(f1_score(y_val, y_pred, average='binary'))

    # Aggregate across folds (np.std is the population standard deviation)
    mean_accuracy = np.mean(accuracy_scores)
    std_accuracy = np.std(accuracy_scores)
    mean_f1 = np.mean(f1_scores)
    std_f1 = np.std(f1_scores)

    print(f"Mean Accuracy: {mean_accuracy:.4f} +/- {std_accuracy:.4f}")
    print(f"Mean F1 Score: {mean_f1:.4f} +/- {std_f1:.4f}")



### Accuracy of train_glove_0v

In [41]:
# Feature matrix (X) and target vector (y) for the glove_0v training set
X, y = train_glove_0v[features], train_glove_0v['target']

In [42]:
# Cross-validate the default-configured forest on the current X / y.
train_and_evaluate(rfc, X, y)

Mean Accuracy: 0.7447 +/- 0.0267
Mean F1 Score: 0.6730 +/- 0.0334


### Accuracy of train_glove_custom

In [43]:
# Feature matrix (X) and target vector (y) for the glove_custom training set
X, y = train_glove_custom[features], train_glove_custom['target']

In [44]:
# Cross-validate the default-configured forest on the current X / y.
train_and_evaluate(rfc, X, y)

Mean Accuracy: 0.7453 +/- 0.0337
Mean F1 Score: 0.6742 +/- 0.0400


### Accuracy of train_word2vec_50d

In [45]:
# Feature matrix (X) and target vector (y) for the word2vec training set
X, y = train_word2vec[features], train_word2vec['target']


In [46]:
# Cross-validate the default-configured forest on the current X / y.
train_and_evaluate(rfc, X, y)

Mean Accuracy: 0.7408 +/- 0.0283
Mean F1 Score: 0.6691 +/- 0.0332


### Accuracy of train_fasttext_300d

In [47]:
# Keep every column except the row id and the label as model inputs
train_fasttext_features = train_fasttext.drop(columns=['id', 'target'])

In [48]:
# Feature matrix (X) and target vector (y) for the fastText training set
X, y = train_fasttext_features, train_fasttext['target']

In [49]:
# Cross-validate the default-configured forest on the current X / y.
train_and_evaluate(rfc, X, y)

Mean Accuracy: 0.7173 +/- 0.0195
Mean F1 Score: 0.6202 +/- 0.0301


### Dataset Evaluation

Based on the accuracy and F1-score, glove_50d_custom performed best when passed into the Random Forest Classifier with its default configuration, although its margin over glove_50d_0v and word2vec_50d is well within one standard deviation across folds.
- Interestingly, the fasttext embedding model with 300 columns (6 times as many as the glove and word2vec datasets) performs worse on both metrics. This could mean that the embedding dimension chosen previously should be lower than 300 for better performance.

In [50]:
from sklearn.model_selection import RandomizedSearchCV

In [51]:
def optimize_hyperparameters(X, y, n_iter=100, cv=5, scoring='f1', random_state=42):
    """Run a randomized hyperparameter search for a Random Forest classifier.

    Parameters
    ----------
    X : array-like or pandas.DataFrame
        Feature matrix passed to ``RandomizedSearchCV.fit``.
    y : array-like or pandas.Series
        Target vector passed to ``RandomizedSearchCV.fit``.
    n_iter : int, default 100
        Number of parameter combinations to sample.
    cv : int or cross-validator, default 5
        Cross-validation strategy forwarded to ``RandomizedSearchCV``.
    scoring : str, default 'f1'
        Metric used to rank the candidates.
    random_state : int, default 42
        Seed used for both the forest and the parameter sampling.

    Returns
    -------
    RandomForestClassifier
        ``best_estimator_`` of the fitted search. The best hyperparameters
        are also printed.
    """
    # Hyperparameter search space
    param_dist = {
        'n_estimators': [10, 50, 100, 200, 500],
        'max_depth': [None, 10, 20, 30, 40, 50],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        # 'auto' is not searched: it was an alias of 'sqrt' for classifiers
        # and is rejected by scikit-learn >= 1.3, which made those candidates
        # fail. 'log2' gives the search a real second option.
        'max_features': ['sqrt', 'log2'],
        'bootstrap': [True, False]
    }

    # Base estimator whose hyperparameters are being searched
    base_model = RandomForestClassifier(random_state=random_state)

    random_search = RandomizedSearchCV(
        base_model,
        param_distributions=param_dist,
        n_iter=n_iter,
        cv=cv,
        scoring=scoring,
        n_jobs=-1,  # use all available cores
        random_state=random_state,
    )

    # Fit the search to the data
    random_search.fit(X, y)

    # Print the best combination of hyperparameters
    print("Best hyperparameters:", random_search.best_params_)

    return random_search.best_estimator_

In [52]:
# Tune on glove_50d_custom, the dataset the evaluation above identified as the
# best performer. glove_0v and word2vec expose the same `features` columns,
# so either can be swapped in here for comparison.
X = train_glove_custom[features]
y = train_glove_custom['target']

In [53]:
# Run the randomized search on the current X / y and keep the estimator it returns.
best_rfc = optimize_hyperparameters(X, y)

In [None]:
# Cross-validate the tuned model on the same X / y the search was run on.
# NOTE(review): these rows were also used to select the hyperparameters, so
# the scores are likely optimistic — confirm on held-out data.
train_and_evaluate(best_rfc, X, y)