In [41]:
import pandas as pd
import numpy as np 
import os
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.preprocessing import LabelEncoder
import nltk
import re

In [5]:
path_training = './data/twitter_training.csv'
path_test = './data/twitter_validation.csv'

In [6]:
column_names = ['twitter_id', 'entity', 'sentiment', 'text']
training_df = pd.read_csv(path_training, names = column_names, header = None)
training_df.text = training_df.text.astype(str)
test_df = pd.read_csv(path_test, names = column_names, header = None)

In [23]:
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 3))

In [8]:
nltk.download('stopwords')
nltk.download('punkt')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/thorsteinn.jonsson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/thorsteinn.jonsson/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

## Text Preprocessing
Lowercasing
Removing punctuation
Removing numbers
Removing stop words
Tokenization
Stemming/Lemmatization

In [24]:
def process_text(text):
    text = text.lower()
    text_nopunctiation = re.sub(r'[^\w\s]', '',text)
    text_nonumbers = re.sub(r'\d+', '', text_nopunctiation)

    words = text_nonumbers.split()
    filtered_text = []
    for word in words:
         if word not in stop_words:
              filtered_text.append(word)

    cleaned_text = ' '.join(filtered_text)
    
    tokens = word_tokenize(cleaned_text)
    stemmed = ' '.join([ps.stem(token) for token in tokens])
    return stemmed




In [25]:
training_df.text = training_df.text.apply(process_text)
training_df.text

0                                 im get borderland murder
1                                         come border kill
2                                   im get borderland kill
3                                im come borderland murder
4                                 im get borderland murder
                               ...                        
74677    realiz window partit mac like year behind nvid...
74678    realiz mac window partit year behind nvidia dr...
74679    realiz window partit mac year behind nvidia dr...
74680    realiz window partit mac like year behind nvid...
74681    like window partit mac like year behind driver...
Name: text, Length: 74682, dtype: object

In [26]:
training_df['sentiment'].unique()


array(['Positive', 'Neutral', 'Negative', 'Irrelevant'], dtype=object)

In [27]:
tfidf = TfidfVectorizer(max_features=5000)
encoder = LabelEncoder()

X = tfidf.fit_transform(training_df.text)
Y = encoder.fit_transform(training_df['sentiment'])

print(X.shape)
print(Y.shape)

(74682, 5000)
(74682,)


In [28]:
x_train, x_test, y_train, y_test = train_test_split(X, Y,  test_size=0.2)

In [29]:
print(x_train.shape)
print(y_train.shape)

(59745, 5000)
(59745,)


In [35]:
model = LogisticRegression(max_iter= 1000, multi_class='multinomial')
model.fit(x_train, y_train)

In [36]:
y_pred = model.predict(x_test)

In [37]:
accuracy = accuracy_score(y_test, y_pred)

In [38]:
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.6731
Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.52      0.59      2604
           1       0.72      0.75      0.74      4610
           2       0.62      0.64      0.63      3710
           3       0.66      0.72      0.69      4013

    accuracy                           0.67     14937
   macro avg       0.67      0.66      0.66     14937
weighted avg       0.67      0.67      0.67     14937

Confusion Matrix:
[[1350  386  376  492]
 [ 183 3464  525  438]
 [ 264  540 2362  544]
 [ 195  405  535 2878]]


In [39]:
# Create DMatrix for XGBoost
dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test, label=y_test)

# Define parameters for XGBoost
params = {
    'objective': 'multi:softmax',  # Specify multiclass classification
    'num_class': 4,                # Number of classes in the dataset
    'max_depth': 5,                # Maximum depth of the trees
    'eta': 0.1,                    # Learning rate
    'seed': 42                     # Random seed
}

# Train the model
num_rounds = 100
bst = xgb.train(params, dtrain, num_rounds)

# Make predictions
preds = bst.predict(dtest)

# Evaluate accuracy
accuracy = accuracy_score(y_test, preds)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.5382


In [46]:
xgb_clf = xgb.XGBClassifier(objective='multi:softmax', num_class=4)

# Create a parameter grid
param_grid = {
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1, 0.3],
    'n_estimators': [100, 200, 300],
    'gamma': [0, 0.1, 0.3],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}
grid_search = GridSearchCV(estimator=xgb_clf, param_grid=param_grid, cv=3, scoring='accuracy', verbose=1)
grid_search.fit(x_train, y_train)


Fitting 3 folds for each of 324 candidates, totalling 972 fits


In [None]:
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")


In [None]:
# Train with the best parameters
best_xgb = xgb.XGBClassifier(**best_params)
best_xgb.fit(x_train, y_train)
y_pred = best_xgb.predict(x_test)

# Evaluate the performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))