In [136]:
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.preprocessing import LabelEncoder
import nltk
import re

### Load the data from Kaggle.com

In [137]:
path_training = './data/twitter_training.csv'
path_val = './data/twitter_validation.csv'

In [142]:
column_names = ['twitter_id', 'entity', 'sentiment', 'text'] # Define column names for the dataset
training_df = pd.read_csv(path_training, names = column_names, header = None) # Read the training and validation datasets
training_df.text = training_df.text.astype(str)
test_df = pd.read_csv(path_val, names = column_names, header = None)

In [144]:
training_df = pd.concat([training_df, test_df], ignore_index=True)  #Append training and validation datasets together (we split them later)

In [145]:
training_df.dropna(inplace=True) # Remove NULL values in data

In [146]:
vectorizer = TfidfVectorizer(ngram_range=(1, 3)) 
# Initialize TF-IDF Vectorizer
# This vectorizer changes words into meaningful numbers that ML models can read, based on how often they appear in the text


In [147]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/thorsteinn.jonsson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/thorsteinn.jonsson/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [148]:
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

In [149]:
training_df.groupby(['sentiment']).count()['twitter_id']

sentiment
Irrelevant    13162
Negative      22808
Neutral       18603
Positive      21109
Name: twitter_id, dtype: int64

In [150]:
test_df.groupby(['sentiment']).count()['twitter_id']

sentiment
Irrelevant    172
Negative      266
Neutral       285
Positive      277
Name: twitter_id, dtype: int64

## Text Preprocessing
Lowercasing
Removing punctuation
Removing numbers
Removing stop words
Tokenization
Stemming/Lemmatization

In [151]:
def process_text(text):
    text = text.lower()
    text_nopunctiation = re.sub(r'[^\w\s]', '',text)
    text_nonumbers = re.sub(r'\d+', '', text_nopunctiation)

    words = text_nonumbers.split()
    filtered_text = []
    for word in words:
         if word not in stop_words:
              filtered_text.append(word)

    cleaned_text = ' '.join(filtered_text)
    
    tokens = word_tokenize(cleaned_text)
    stemmed = ' '.join([ps.stem(token) for token in tokens])
    return stemmed

In [152]:
training_df.text = training_df.text.apply(process_text)
training_df.text

0                                 im get borderland murder
1                                         come border kill
2                                   im get borderland kill
3                                im come borderland murder
4                                 im get borderland murder
                               ...                        
75677    toronto art cultur capit canada wonder want st...
75678    actual good move tot bring viewer one peopl go...
75679    today suck time drink wine n play borderland s...
75680            bought fraction microsoft today small win
75681    johnson johnson stop sell talc babi powder us ...
Name: text, Length: 75682, dtype: object

In [153]:
test_df.text = test_df.text.apply(process_text)

In [154]:
training_df['sentiment'].unique()

array(['Positive', 'Neutral', 'Negative', 'Irrelevant'], dtype=object)

In [155]:
tfidf = TfidfVectorizer()
encoder = LabelEncoder()

X = tfidf.fit_transform(training_df.text)
Y = encoder.fit_transform(training_df['sentiment'])

print(X.shape)
print(Y.shape)

(75682, 31816)
(75682,)


In [156]:
X_validation = tfidf.fit_transform(test_df.text)
Y_validation = encoder.fit_transform(test_df['sentiment'])

In [157]:
x_train, x_test, y_train, y_test = train_test_split(X, Y,  test_size=0.2)

In [158]:
print(x_train.shape)
print(y_train.shape)

(60545, 31816)
(60545,)


In [159]:
model = LogisticRegression(max_iter = 1000, multi_class='ovr')
model.fit(x_train, y_train)

In [160]:
y_pred = model.predict(x_test)

In [161]:
accuracy = accuracy_score(y_test, y_pred)

In [162]:
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.7347
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.57      0.67      2616
           1       0.75      0.81      0.78      4588
           2       0.68      0.71      0.69      3740
           3       0.73      0.78      0.75      4193

    accuracy                           0.73     15137
   macro avg       0.74      0.72      0.72     15137
weighted avg       0.74      0.73      0.73     15137

Confusion Matrix:
[[1502  357  346  411]
 [ 120 3715  448  305]
 [ 133  482 2643  482]
 [ 126  368  438 3261]]


In [165]:
model = RandomForestClassifier()

In [166]:
model.fit(x_train, y_train)

In [167]:
y_pred = model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.9074453326286582

### Pretty good accuracy, but hyper parameter tuning could make it better

In [174]:
# Create the random grid
param_grid = {'n_estimators': [100, 200, ],
               'max_features': ['auto', 'sqrt'],
               'max_depth': [2, 5, 10 ],
               'min_samples_split': [2, 5],
               'min_samples_leaf': [1, 2],
               'bootstrap': [True]}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='accuracy', verbose = 2)
grid_search.fit(x_train, y_train)

Fitting 3 folds for each of 64 candidates, totalling 192 fits
[CV] END bootstrap=True, max_depth=2, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END bootstrap=True, max_depth=2, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END bootstrap=True, max_depth=2, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END bootstrap=True, max_depth=2, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.0s
[CV] END bootstrap=True, max_depth=2, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.0s
[CV] END bootstrap=True, max_depth=2, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.0s
[CV] END bootstrap=True, max_depth=2, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 

96 fits failed out of a total of 192.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
96 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/homebrew/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/homebrew/lib/python3.11/site-packages/sklearn/base.py", line 1467, in wrapper
    estimator._validate_params()
  File "/opt/homebrew/lib/python3.11/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/opt/homebrew/lib/python3.11/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParamete

In [224]:
model_tuned = grid_search.best_estimator_


In [241]:
model_tuned = RandomForestClassifier()
model_tuned.class_weight = 'balanced'
model_tuned.classes_ = [0,1,2,3]
model_tuned.max_depth = None

In [242]:
model_tuned.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [238]:
model_tuned.fit(x_train, y_train)

In [239]:
y_pred = model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.9087005351126379

In [243]:
model = RandomForestClassifier()
model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [244]:
model.fit(x_train, y_train)

In [245]:
y_pred = model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.9079077756490718

### Let's try XGBoost

In [168]:
# Create DMatrix for XGBoost
dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test, label=y_test)

# Define parameters for XGBoost
params = {
    'objective': 'multi:softmax',  # Specify multiclass classification
    'num_class': 4,                # Number of classes in the dataset
    'max_depth': 5,                # Maximum depth of the trees
    'eta': 0.5,                    # Learning rate
    'seed': 42                     # Random seed
}

# Train the model
num_rounds = 100
bst = xgb.train(params, dtrain, num_rounds)

# Make predictions
preds = bst.predict(dtest)

# Evaluate accuracy
accuracy = accuracy_score(y_test, preds)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.6570


## Accuracy is lower, probably because we haven't tuned the hyperparameters

In [88]:
xgb_clf = xgb.XGBClassifier(objective='multi:softmax', num_class=4)

# Create a parameter grid
param_grid = {
    'max_depth': [3],
    'learning_rate': [0.5, 0.3],
    'n_estimators': [100],
    'gamma': [0, 0.1, 0.3],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}
grid_search = GridSearchCV(estimator=xgb_clf, param_grid=param_grid, cv=3, scoring='accuracy', verbose = 0)
grid_search.fit(x_train, y_train)


KeyboardInterrupt: 