In [48]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import nltk
import string 
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer


import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import precision_score, recall_score, roc_auc_score



In [25]:
df = pd.read_csv('https://raw.githubusercontent.com/campusx-official/jupyter-masterclass/main/tweet_emotions.csv')

df.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [26]:
df.drop(columns=['tweet_id'], inplace=True)
df

Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...
...,...,...
39995,neutral,@JohnLloydTaylor
39996,love,Happy Mothers Day All my love
39997,love,Happy Mother's Day to all the mommies out ther...
39998,happiness,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...


In [27]:
final_df = df[df['sentiment'].isin(['happiness', 'sadness'])]

In [28]:
final_df

Unnamed: 0,sentiment,content
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
6,sadness,"I should be sleep, but im not! thinking about ..."
8,sadness,@charviray Charlene my love. I miss you
9,sadness,@kelcouch I'm sorry at least it's Friday?
...,...,...
39986,happiness,going to watch boy in the striped pj's hope i ...
39987,happiness,"gave the bikes a thorough wash, degrease it an..."
39988,happiness,"had SUCH and AMAZING time last night, McFly we..."
39994,happiness,Succesfully following Tayla!!


In [29]:
final_df.loc[:, 'sentiment'] = final_df['sentiment'].replace({"happiness": 1, "sadness": 0})


  final_df.loc[:, 'sentiment'] = final_df['sentiment'].replace({"happiness": 1, "sadness": 0})


In [30]:
final_df['sentiment'].value_counts()

sentiment
1    5209
0    5165
Name: count, dtype: int64

In [31]:
train_data, test_data = train_test_split(final_df, test_size=0.2, random_state=42)

In [32]:
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Zabih\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Zabih\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [33]:
def lemmatization(text):
    lemmatizater = WordNetLemmatizer()
    text = text.split()
    text = [lemmatizater.lemmatize(y) for y in text]
    return " ".join(text)
    

def remove_stop_words(text):
    text = "".join([i for i in text if not i.isdigit()])
    return text

def removing_numbers(text):
    text = text.split()
    text = [y.lower() for y in text]
    return " ".join(text)

def lower_case(text):
    text = re.sub('[%s]' % re.escape("""!"#$%&'()*+,،-./:;<=>؟?@[\]^_`{|}~"""), ' ', text)
    text = text.replace(":", "")

    text = re.sub('\s+', ' ', text)
    text =  " ".join(text.split())
    return text.strip()

    

def removing_punctuations(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

def removing_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

def remove_small_sentences(df):
    for i in range(len(df)):
        if len(df.text.iloc[i].split()) < 3:
            df.text.iloc[i] = np.nan

def normalize_text(df):
    df.content=df.content.apply(lambda content : lower_case(content))
    df.content=df.content.apply(lambda content : remove_stop_words(content))
    df.content=df.content.apply(lambda content : removing_numbers(content))
    df.content=df.content.apply(lambda content : removing_punctuations(content))
    df.content=df.content.apply(lambda content : removing_urls(content))
    df.content=df.content.apply(lambda content : lemmatization(content))
    return df

def normalized_sentence(sentence):
    sentence= lower_case(sentence)
    sentence= remove_stop_words(sentence)
    sentence= removing_numbers(sentence)
    sentence= removing_punctuations(sentence)
    sentence= removing_urls(sentence)
    sentence= lemmatization(sentence)
    return sentence



  text = re.sub('[%s]' % re.escape("""!"#$%&'()*+,،-./:;<=>؟?@[\]^_`{|}~"""), ' ', text)
  text = re.sub('\s+', ' ', text)


In [34]:
normalized_sentence("That's it? It's done already? This is one")

'that s it it s done already this is one'

In [35]:
train_data = normalize_text(train_data)
test_data = normalize_text(test_data)

In [36]:
train_data

Unnamed: 0,sentiment,content
23531,0,quot my problem isn t that i miss you cause i ...
8051,0,that s it it s done already this is one proof ...
11499,0,i am so hungry and there is no food for me to ...
31288,1,foot hurt finally in bed will not forget this ...
18561,0,really ill atm
...,...,...
21697,1,chocolatesuze yes yes you should especially wi...
19445,0,kickzfadayz our boy better get it in tonight o...
20216,1,tafe wa actually quite good for once
3258,0,minute to boarding hour to home no window seat


In [37]:
X_train = train_data['content'].values
y_train = train_data['sentiment'].values

X_test = test_data['content'].values
y_test = test_data['sentiment'].values

In [38]:
# Apply Bag of Words (CountVectorizer)
vectorizer = CountVectorizer()

# Fit the vectorizer on the training data and transform it
X_train_bow = vectorizer.fit_transform(X_train)

# Transform the test data using the same vectorizer
X_test_bow = vectorizer.transform(X_test)

In [39]:
train_df = pd.DataFrame(X_train_bow.toarray())

train_df['label'] = y_train
train_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14244,14245,14246,14247,14248,14249,14250,14251,14252,label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [42]:
# Define and train the XGBoost model
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgb_model.fit(X_train_bow, y_train)

# Make predictions
y_pred = xgb_model.predict(X_test_bow)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [44]:
import numpy as np

y_test = np.array(y_test, dtype=int)  # or dtype=str, depending on your labels
y_pred = np.array(y_pred, dtype=int)  # ensure this matches the dtype of y_test


In [45]:

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)


In [46]:
accuracy

0.7845783132530121

In [49]:

classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)

Accuracy: 0.7845783132530121
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.73      0.78      1060
           1       0.75      0.84      0.79      1015

    accuracy                           0.78      2075
   macro avg       0.79      0.79      0.78      2075
weighted avg       0.79      0.78      0.78      2075



In [50]:
# Make predictions
y_pred = xgb_model.predict(X_test_bow)
y_pred_proba = xgb_model.predict_proba(X_test_bow)[:, 1]

# Calculate evaluation metrics
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)

In [51]:
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"AUC: {auc}")

Precision: 0.7508833922261484
Recall: 0.8374384236453202
AUC: 0.8694544102611769
