In [2]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

### Random Forest Model
All datasets below will be passed through a `Stratified K-Fold` and a `Random Forest Model` to test for accuracy.
The best performing training dataset will be used as the main dataset for tuning the Random Forest Model.
#### Why are we using Random Forest Model for this project?
- `Handling high-dimensional data:`
    - Random Forest can efficiently handle high-dimensional data, which is suitable for text embeddings that often result in a large number of features (dimensions).
- `Robustness to noise:` 
    - Tweets may contain noise in the form of misspellings, slang, or abbreviations. Random Forest is robust to noise due to its ensemble nature, aggregating the output of multiple decision trees to make a final prediction.
- `Model interpretability:` 
    - Although not as interpretable as simple decision trees, Random Forest still provides some level of interpretability by calculating feature importances. This can help in understanding which dimensions of the embeddings are most important for the - classification task.
- `Reduced overfitting:`
    - Random Forest reduces the risk of overfitting by constructing multiple decision trees and combining their outputs. This ensemble approach leads to a more generalized model compared to a single decision tree.
- `Built-in feature selection:`
    - Random Forest inherently performs feature selection by considering a random subset of features for each tree. This can be particularly beneficial when working with high-dimensional data, such as text embeddings.
- `Ease of implementation: `
    - Random Forest is easy to implement using popular machine learning libraries like scikit-learn. It also requires minimal preprocessing and hyperparameter tuning, making it a good choice for a fast initial implementation.
- `Parallelism:`
    - The nature of Random Forest allows for parallel processing, which can significantly speed up training and prediction times, especially when dealing with large datasets.
- `Performance:`
    - Random Forest has been known to perform well in various classification tasks, making it a reliable choice for your binary classification problem.

In [3]:
train_glove_0v = pd.read_csv('../numerical_datasets/train_data_mod_glove_50d_0v_numerical.csv')
test_glove_0v =  pd.read_csv('../numerical_datasets/test_data_mod_glove_50d_0v_numerical.csv')
train_glove_custom = pd.read_csv('../numerical_datasets/train_data_mod_glove_50d_custom_numerical.csv')
test_glove_custom =  pd.read_csv('../numerical_datasets/test_data_mod_glove_50d_custom_numerical.csv')
train_fasttext = pd.read_csv('../numerical_datasets/train_data_mod_fasttext_300d_numerical.csv')
test_fasttext =  pd.read_csv('../numerical_datasets/test_data_mod_fasttext_300d_numerical.csv')
train_word2vec = pd.read_csv('../numerical_datasets/train_data_mod_word2vec_50d_numerical.csv')
test_word2vec =  pd.read_csv('../numerical_datasets/test_data_mod_word2vec_50d_numerical.csv')

In [4]:
train_glove_0v.columns

Index(['id', 'target', 'keyword_encoded', 'tweet_length', 'punctuation_count',
       'embedding_0', 'embedding_1', 'embedding_2', 'embedding_3',
       'embedding_4', 'embedding_5', 'embedding_6', 'embedding_7',
       'embedding_8', 'embedding_9', 'embedding_10', 'embedding_11',
       'embedding_12', 'embedding_13', 'embedding_14', 'embedding_15',
       'embedding_16', 'embedding_17', 'embedding_18', 'embedding_19',
       'embedding_20', 'embedding_21', 'embedding_22', 'embedding_23',
       'embedding_24', 'embedding_25', 'embedding_26', 'embedding_27',
       'embedding_28', 'embedding_29', 'embedding_30', 'embedding_31',
       'embedding_32', 'embedding_33', 'embedding_34', 'embedding_35',
       'embedding_36', 'embedding_37', 'embedding_38', 'embedding_39',
       'embedding_40', 'embedding_41', 'embedding_42', 'embedding_43',
       'embedding_44', 'embedding_45', 'embedding_46', 'embedding_47',
       'embedding_48', 'embedding_49'],
      dtype='object')

In [5]:
features = ['keyword_encoded', 'tweet_length', 'punctuation_count',
       'embedding_0', 'embedding_1', 'embedding_2', 'embedding_3',
       'embedding_4', 'embedding_5', 'embedding_6', 'embedding_7',
       'embedding_8', 'embedding_9', 'embedding_10', 'embedding_11',
       'embedding_12', 'embedding_13', 'embedding_14', 'embedding_15',
       'embedding_16', 'embedding_17', 'embedding_18', 'embedding_19',
       'embedding_20', 'embedding_21', 'embedding_22', 'embedding_23',
       'embedding_24', 'embedding_25', 'embedding_26', 'embedding_27',
       'embedding_28', 'embedding_29', 'embedding_30', 'embedding_31',
       'embedding_32', 'embedding_33', 'embedding_34', 'embedding_35',
       'embedding_36', 'embedding_37', 'embedding_38', 'embedding_39',
       'embedding_40', 'embedding_41', 'embedding_42', 'embedding_43',
       'embedding_44', 'embedding_45', 'embedding_46', 'embedding_47',
       'embedding_48', 'embedding_49']

In [6]:
# Define the number of folds for K-fold cross-validation
num_folds = 5

# Instantiate the Random Forest classifier
rfc = RandomForestClassifier(random_state=42)

# Instantiate the Stratified K-fold cross-validator
skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)



Here we define the function that will be used to train and evaluate the model's accuracy based on the Stratified K-Fold.

In [7]:
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
from sklearn.metrics import make_scorer, f1_score, accuracy_score


In [8]:
def train_and_evaluate(rfc, X, y):
    # Initialize lists to store accuracy and F1 scores for each fold
    accuracy_scores = []
    f1_scores = []

    # Loop through each fold of the K-fold cross-validator
    for train_index, test_index in skf.split(X, y):
        # Split the data into training and validation sets for this fold
        X_train, X_val = X.iloc[train_index], X.iloc[test_index]
        y_train, y_val = y.iloc[train_index], y.iloc[test_index]

        # Train the Random Forest classifier on the training data
        rfc.fit(X_train, y_train)

        # Make predictions on the validation data
        y_pred = rfc.predict(X_val)

        # Calculate the validation accuracy score for this fold
        accuracy = accuracy_score(y_val, y_pred)
        accuracy_scores.append(accuracy)

        # Calculate the validation F1 score for this fold
        f1 = f1_score(y_val, y_pred, average='binary')  # Use 'weighted' for multi-class problems
        f1_scores.append(f1)

    # Calculate the mean and standard deviation of the validation accuracy and F1 scores
    mean_accuracy = np.mean(accuracy_scores)
    std_accuracy = np.std(accuracy_scores)
    mean_f1 = np.mean(f1_scores)
    std_f1 = np.std(f1_scores)

    # Print the mean and standard deviation of the validation accuracy and F1 scores
    print(f"Mean Accuracy: {mean_accuracy:.4f} +/- {std_accuracy:.4f}")
    print(f"Mean F1 Score: {mean_f1:.4f} +/- {std_f1:.4f}")

In [9]:
def combined_scorer(y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='binary')
    return acc * f1

### Let's discuss the hyperparameters in the RFC model

`criterion`: This hyperparameter determines the function used to measure the quality of a split when constructing the decision trees within the random forest. The two supported criteria in scikit-learn's RandomForestClassifier are:

`gini`: Gini impurity, which measures how often a randomly chosen element from the set would be incorrectly labeled if it was randomly labeled according to the distribution of labels in the subset. Lower Gini impurity indicates a better split.

`entropy`: Information gain, which is based on the concept of entropy from information theory. A split with higher information gain results in a more homogeneous child node, making it a better split.
There is no universally best criterion, as the choice of the criterion may depend on the specific problem and dataset. By including both options in the parameter space, the Bayesian optimization will determine the best criterion for your specific problem.

`max_features`: This hyperparameter controls the number of features to consider when looking for the best split in each decision tree within the random forest. Considering a subset of features introduces randomness and diversity in the trees, which can lead to a more generalized model. The options for this hyperparameter in scikit-learn's RandomForestClassifier are:

`auto`: Equivalent to sqrt, which means the square root of the total number of features will be considered at each split.

`sqrt`: The square root of the total number of features will be considered at each split.

`log2`: The base-2 logarithm of the total number of features will be considered at each split.
You can also specify an integer, which directly sets the number of features to consider at each split, or a float in the range (0, 1), which represents the fraction of features to consider.
Including all three categorical options in the parameter space allows the Bayesian optimization to find the best max_features value for your specific problem. You can also include the float and integer options if you want to explore a broader range of possibilities.

In [11]:
def train_and_evaluate_bayes_opt(X, y, skf):
    param_space = {
        'n_estimators': Integer(50, 500),
        'max_depth': Integer(3, 30),
        'min_samples_split': Integer(2, 10),
        'min_samples_leaf': Integer(1, 10),
        'criterion': Categorical(['gini', 'entropy']),
        'max_features': Categorical(['auto', 'sqrt', 'log2'])
    }
    custom_scorer = make_scorer(combined_scorer, greater_is_better=True)

    bayes_opt = BayesSearchCV(
        estimator=RandomForestClassifier(random_state=42),
        search_spaces=param_space,
        scoring=custom_scorer,
        n_iter=50,
        cv=skf,
        n_jobs=-1,
        random_state=42,
        verbose=1
    )

    bayes_opt.fit(X, y)

    print("Best hyperparameters:", bayes_opt.best_params_)
    print("Best combined score:", bayes_opt.best_score_)
    
    return bayes_opt.best_estimator_

#### Train_glove_0v


In [10]:
# Split the dataframe into features (X) and labels (y)
X = train_glove_0v[features]
y = train_glove_0v['target']

##### Before Hypertuning

In [11]:
train_and_evaluate(rfc, X, y)

Mean Accuracy: 0.7666 +/- 0.0170
Mean F1 Score: 0.7034 +/- 0.0204


##### Finding best Hyparameters for this model

In [13]:
train_and_evaluate_bayes_opt(X, y, skf)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi



Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

##### After Hypertuning

In [22]:
rfc_optimized = RandomForestClassifier(random_state=42, n_estimators=236, max_depth=23, min_samples_split=7, min_samples_leaf=4, criterion='entropy') 

In [23]:
train_and_evaluate(rfc_optimized, X, y)

Mean Accuracy: 0.7717 +/- 0.0187
Mean F1 Score: 0.7156 +/- 0.0221


In [24]:
# Try on test data
test_pred = rfc_optimized.predict(test_glove_0v[features])

# export to csv
df = pd.DataFrame({'id': test_glove_0v['id'], 'target': test_pred})

# Export the DataFrame to a CSV file
df.to_csv('test_results/predicted_target_glove_0d.csv', index=False)


### Accuracy of train_glove_custom

##### Before Hypertuning

In [18]:
X = train_glove_custom[features]
y = train_glove_custom['target']

In [17]:
train_and_evaluate(rfc, X, y)

Mean Accuracy: 0.7674 +/- 0.0167
Mean F1 Score: 0.7070 +/- 0.0186


##### Finding best Hyparameters for this model

In [18]:
best_rfc = train_and_evaluate_bayes_opt(X, y, skf)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

  warn(


Best hyperparameters: OrderedDict([('criterion', 'gini'), ('max_depth', 14), ('max_features', 'auto'), ('min_samples_leaf', 10), ('min_samples_split', 4), ('n_estimators', 156)])
Best combined score: 0.5527176864410794


##### After Hypertuning

In [25]:
rfc_optimized = RandomForestClassifier(random_state=42, n_estimators=156, max_depth=14, min_samples_split=4, min_samples_leaf=10, criterion='gini', max_features='auto') 

In [26]:
train_and_evaluate(rfc_optimized, X, y)

  warn(
  warn(
  warn(
  warn(
  warn(


Mean Accuracy: 0.7724 +/- 0.0197
Mean F1 Score: 0.7150 +/- 0.0233


In [27]:
# Try on test data
test_pred = rfc_optimized.predict(test_glove_custom[features])

# export to csv
df = pd.DataFrame({'id': test_glove_custom['id'], 'target': test_pred})

# Export the DataFrame to a CSV file
df.to_csv('test_results/predicted_target_glove_custom.csv', index=False)


### Accuracy of train_word2vec_50d

##### Before Hypertuning

In [28]:
X = train_word2vec[features]
y = train_word2vec['target']


In [27]:
train_and_evaluate(rfc, X, y)

Mean Accuracy: 0.7679 +/- 0.0163
Mean F1 Score: 0.7101 +/- 0.0192


##### Finding best Hyparameters for this model

In [28]:
best_rfc = train_and_evaluate_bayes_opt(X, y, skf)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

##### After Hypertuning

In [30]:
rfc_optimized = RandomForestClassifier(random_state=42, n_estimators=236, max_depth=23, min_samples_split=7, min_samples_leaf=4, criterion='entropy') 
train_and_evaluate(rfc_optimized, X, y)

Mean Accuracy: 0.7724 +/- 0.0149
Mean F1 Score: 0.7169 +/- 0.0177


In [31]:
# Try on test data
test_pred = rfc_optimized.predict(test_word2vec[features])

# export to csv
df = pd.DataFrame({'id': test_word2vec['id'], 'target': test_pred})

# Export the DataFrame to a CSV file
df.to_csv('test_results/predicted_target_word2vec.csv', index=False)

### Accuracy of train_fasttext_300d

##### Before Hypertuning

In [32]:
train_fasttext_features = train_fasttext.drop(['id', 'target'], axis=1, inplace=False)

In [33]:
X = train_fasttext_features
y = train_fasttext['target']

In [32]:
train_and_evaluate(rfc, X, y)

Mean Accuracy: 0.7566 +/- 0.0125
Mean F1 Score: 0.6842 +/- 0.0160


##### Finding best Hyparameters for this model

In [33]:
best_rfc = train_and_evaluate_bayes_opt(X, y, skf)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

##### After Hypertuning

In [34]:
rfc_optimized = RandomForestClassifier(random_state=42, n_estimators=500, max_depth=30, min_samples_split=2, min_samples_leaf=3, criterion='entropy') 
train_and_evaluate(rfc_optimized, X, y)

Mean Accuracy: 0.7647 +/- 0.0157
Mean F1 Score: 0.6949 +/- 0.0194


In [36]:
test_fasttext.columns

Index(['id', 'keyword_encoded', 'tweet_length', 'punctuation_count',
       'embedding_0', 'embedding_1', 'embedding_2', 'embedding_3',
       'embedding_4', 'embedding_5',
       ...
       'embedding_290', 'embedding_291', 'embedding_292', 'embedding_293',
       'embedding_294', 'embedding_295', 'embedding_296', 'embedding_297',
       'embedding_298', 'embedding_299'],
      dtype='object', length=304)

In [37]:
test_fasttext_no_id = test_fasttext.drop(['id'], axis=1, inplace=False)

In [38]:
# Try on test data
test_pred = rfc_optimized.predict(test_fasttext_no_id)

# export to csv
df = pd.DataFrame({'id': test_fasttext['id'], 'target': test_pred})

# Export the DataFrame to a CSV file
df.to_csv('test_results/predicted_target_fasttext.csv', index=False)

## Dataset Evaluation

Random Forest Classification Test Set Prediction Scores (from Kaggle submission): 
1. predicted_target_word2vec.csv: `0.75268` 
2. predicted_target_glove_0d.csv: `0.74532` 
3. predicted_target_glove_custom.csv: `0.74501` 
4. predicted_target_fasttext.csv: `0.74164` 
 

Based on prediction scores, we can conclude that `train_word2vec_50d.csv` performed the best in training the Random Forest Classifier with optimized configuration. 
Interestingly, `train_fasttext_300d.csv`  with 300 columns (6 times more than the glove and wordvec dataset), performed the poorest among all datasets. This means that having a higher embedding dimension may not necessarily mean better accuracy, and may make it worse instead.<br>
Even though `train_glove_custom.csv` had the highest validation accuracy, it did not perform the best for the test set. This could mean that the training data might be slightly overfitting the model, resulting in poorer performance to unseen data.