In [1]:
# --- Mount Google Drive (for Colab) ---
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [53]:
import os

base_dir = '/content/drive/MyDrive/NLP'
os.makedirs(base_dir, exist_ok=True)
train_path = os.path.join(base_dir, 'Dataset/train.csv')
test_path  = os.path.join(base_dir, 'Dataset/test.csv')

In [54]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import plotly
import sklearn
import tensorflow

In [55]:
df_train = pd.read_csv(train_path)
df_test  = pd.read_csv(test_path)

In [56]:
df_train.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [57]:
df_test.head()


Unnamed: 0,textID,text,sentiment
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative
3,01082688c6,happy bday!,positive
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive


In [58]:
print(df_train.shape)
df_test.shape

(27481, 4)


(3534, 3)

In [59]:
import cufflinks as cf
from plotly.offline import download_plotlyjs,init_notebook_mode,plot,iplot
init_notebook_mode(connected=True)
cf.go_offline()

In [60]:
import nltk


In [61]:
from nltk.corpus import stopwords

In [62]:
print(df_train.columns)
print(df_test.columns)

Index(['textID', 'text', 'selected_text', 'sentiment'], dtype='object')
Index(['textID', 'text', 'sentiment'], dtype='object')


In [63]:
print(df_train.info())
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27481 entries, 0 to 27480
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   textID         27481 non-null  object
 1   text           27480 non-null  object
 2   selected_text  27480 non-null  object
 3   sentiment      27481 non-null  object
dtypes: object(4)
memory usage: 858.9+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3534 entries, 0 to 3533
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   textID     3534 non-null   object
 1   text       3534 non-null   object
 2   sentiment  3534 non-null   object
dtypes: object(3)
memory usage: 83.0+ KB


In [64]:

print(df_train.isna().sum())

# only 2 null values out of 24000 total values.
# let's drop them.


df_test.isna().sum()

# No null values in test dataset.

textID           0
text             1
selected_text    1
sentiment        0
dtype: int64


Unnamed: 0,0
textID,0
text,0
sentiment,0


In [65]:
df_train.dropna(inplace=True)

In [66]:
df_train.isna().sum()

# No null values left.

Unnamed: 0,0
textID,0
text,0
selected_text,0
sentiment,0


In [67]:
# adding a column of text_length
df_train['text_length'] = df_train['text'].apply(lambda x : len(x))

df_test['text_length'] = df_test['text'].apply(lambda x : len(x))

In [68]:
import string

In [69]:
print(df_train['text'][4])
df_train['selected_text'][4].split()

 Sons of ****, why couldn`t they put them on the releases we already bought


['Sons', 'of', '****,']

In [70]:
def sel_tex(i):
    split_text = i.split()
    return split_text

In [71]:
df_train['selected_text2'] = df_train['selected_text'].apply(sel_tex)


In [72]:
df_train.head()

Unnamed: 0,textID,text,selected_text,sentiment,text_length,selected_text2
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,36,"[I`d, have, responded,, if, I, were, going]"
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,46,"[Sooo, SAD]"
2,088c60f138,my boss is bullying me...,bullying me,negative,25,"[bullying, me]"
3,9642c003ef,what interview! leave me alone,leave me alone,negative,31,"[leave, me, alone]"
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,75,"[Sons, of, ****,]"


In [26]:
!pip install emoji
!pip install contractions

Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.1-py3-none-any.whl (590 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/590.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.6/590.6 kB[0m [31m29.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.14.1
Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-

In [73]:
import re
import emoji
import contractions
import string
import nltk

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Make sure these are downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

# Setup
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Preserve negators
for neg in ['not', 'no', 'never', 'none', 'nobody', 'nothing', "don't", "didn't", "can't", "won't"]:
    stop_words.discard(neg)

# Precompiled patterns
pattern_web     = re.compile(r"(([\w]+:)?//)?(([\d\w]|%[a-fA-F\d]{2})+(:([\d\w]|%[a-fA-f\d]{2})+)?@)?([\d\w][-\d\w]{0,253}[\d\w]\.)+[\w]{2,4}(:[\d]+)?(/([-+_~.\d\w]|%[a-fA-f\d]{2})*)*(\?(&?([-+_~.\d\w]|%[a-fA-f\d]{2})=?)*)?(#([-+_~.\d\w]|%[a-fA-f\d]{2})*)?")
pattern_email   = re.compile(r"(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*)@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)])")
pattern_hash    = re.compile(r'#(\w+)')
pattern_handle  = re.compile(r'@\w+')
pattern_repeat  = re.compile(r'([A-Za-z])\1{2,}')
pattern_num     = re.compile(r'[0-9]')
pattern_apos    = re.compile(r'[`’]')
# We'll strip all punctuation except our <EXC> and <QUES> tokens
# So exclude '<', '>', and letters in those tokens from the regex
punct_to_keep   = '<> '
all_punc        = set(string.punctuation) - set('<>')
pattern_punc    = re.compile(f"[{re.escape(''.join(all_punc))}]")
pattern_special = re.compile(r'[^a-zA-Z0-9\s<>]')
pattern_space   = re.compile(r'\s+')

def preprocess_text(tweet: str) -> str:
    # 1) Emoji → text
    tweet = emoji.demojize(str(tweet))

    # 2) URLs, emails, hashtags, handles
    tweet = pattern_web.sub(' ', tweet)
    tweet = pattern_email.sub(' ', tweet)
    tweet = pattern_hash.sub(r' \1', tweet)  # keep just the word of #tag
    tweet = pattern_handle.sub(' ', tweet)

    # 3) Normalize apostrophes + expand contractions
    tweet = pattern_apos.sub("'", tweet)
    tweet = contractions.fix(tweet)

    # 4) Map repeated sentiment punctuation
    tweet = re.sub(r'!{2,}', ' <EXC> ', tweet)
    tweet = re.sub(r'\?{2,}', ' <QUES> ', tweet)

    # 5) Character repeats, numbers, punctuation, special chars
    tweet = re.sub(r' <EXC> ', ' ', tweet)
    tweet = re.sub(r' <QUES> ', ' ', tweet)
    tweet = pattern_repeat.sub(r'\1', tweet)
    tweet = pattern_num.sub(' ', tweet)
    tweet = pattern_punc.sub(' ', tweet)
    tweet = pattern_special.sub(' ', tweet)

    # 6) Lowercase
    tweet = tweet.lower()

    # 7) Whitespace normalization
    tweet = pattern_space.sub(' ', tweet).strip()

    return tweet


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [74]:
# selected_text column of test dataset will bo on the basis of selected_text of Train dataset to
#    predict better for types of messages.

select_text = pd.Series(df_train['selected_text'].apply(preprocess_text))

for df in (df_train, df_test):
    df['text'] = df['text'].astype(str).apply(preprocess_text)

list1 = ' '.join(select_text)

list2 = list1.split()

In [75]:
def test_select(i):
    l  = [ ]
    for w in i.split():
        if w in list2:
            l.append(w)
    return(l)

In [76]:
df_test['selected_text'] = df_test['text'].apply(test_select)

In [77]:
df_test.head(6)

Unnamed: 0,textID,text,sentiment,text_length,selected_text
0,f87dea47db,last session of the day,neutral,49,"[last, session, of, the, day]"
1,96d74cb729,shanghai is also really exciting precisely sky...,positive,103,"[is, also, really, exciting, galore, good, twe..."
2,eee518ae67,recession hit veronique branquinho she has to ...,negative,78,"[recession, hit, she, has, to, quit, her, comp..."
3,01082688c6,happy birthday,positive,12,"[happy, birthday]"
4,33987a8ee5,i like it,positive,39,"[i, like, it]"
5,726e501993,that is great we visitors,positive,32,"[that, is, great, we]"


In [33]:
df_train.head(1)

Unnamed: 0,textID,text,selected_text,sentiment,text_length,selected_text2
0,cb774db0d1,i would have responded if i were going,"I`d have responded, if I were going",neutral,36,"[I`d, have, responded,, if, I, were, going]"


In [78]:
from sklearn.feature_extraction.text import CountVectorizer

# Fitting and Training the Model

In [79]:
bag_of_words = CountVectorizer(analyzer=test_select).fit(df_test['text'])

In [80]:
df_test_bow_trans = bag_of_words.transform(df_test['text'])

In [81]:
df_test_bow_trans


<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 40797 stored elements and shape (3534, 4461)>

In [82]:
from sklearn.feature_extraction.text import TfidfTransformer

In [83]:
tfidf = TfidfTransformer().fit(df_test_bow_trans)

In [84]:
df_test_tfidf = tfidf.transform(df_test_bow_trans)

In [85]:
df_test_tfidf.shape

(3534, 4461)

In [86]:
from sklearn.naive_bayes import MultinomialNB

In [87]:
from sklearn.metrics import confusion_matrix,classification_report

In [88]:
df_test.head()


Unnamed: 0,textID,text,sentiment,text_length,selected_text
0,f87dea47db,last session of the day,neutral,49,"[last, session, of, the, day]"
1,96d74cb729,shanghai is also really exciting precisely sky...,positive,103,"[is, also, really, exciting, galore, good, twe..."
2,eee518ae67,recession hit veronique branquinho she has to ...,negative,78,"[recession, hit, she, has, to, quit, her, comp..."
3,01082688c6,happy birthday,positive,12,"[happy, birthday]"
4,33987a8ee5,i like it,positive,39,"[i, like, it]"


In [89]:
df_test['text_length'] = df_test['text'].apply(lambda x : len(x))

In [90]:
def test_select(i):
    list_text = [text for text in i if text not in string.punctuation]
    join_test_text = ''.join(list_text)
    clean_test_text = [ text for text in join_test_text.split() if text.lower() not in stopwords.words('english')]
    return clean_test_text

In [91]:
df_test['selected_text'] = df_test['text'].apply(test_select)

In [92]:
df_test.head()

Unnamed: 0,textID,text,sentiment,text_length,selected_text
0,f87dea47db,last session of the day,neutral,23,"[last, session, day]"
1,96d74cb729,shanghai is also really exciting precisely sky...,positive,88,"[shanghai, also, really, exciting, precisely, ..."
2,eee518ae67,recession hit veronique branquinho she has to ...,negative,75,"[recession, hit, veronique, branquinho, quit, ..."
3,01082688c6,happy birthday,positive,14,"[happy, birthday]"
4,33987a8ee5,i like it,positive,9,[like]


In [93]:
bag_of_words = CountVectorizer(analyzer=test_select).fit(df_test['text'])


df_test_bow_trans = bag_of_words.transform(df_test['text'])


tfidf = TfidfTransformer().fit(df_test_bow_trans)


df_test_tfidf = tfidf.transform(df_test_bow_trans)

In [96]:
from sklearn.svm import SVC

In [97]:
sentiment_detect_model = SVC().fit(df_test_tfidf,df_test['sentiment'])

In [98]:
print(classification_report(df_test['sentiment'], all_sentiments_predictions))

              precision    recall  f1-score   support

    negative       0.98      0.75      0.85      1001
     neutral       0.79      0.97      0.87      1430
    positive       0.94      0.87      0.90      1103

    accuracy                           0.88      3534
   macro avg       0.90      0.86      0.88      3534
weighted avg       0.89      0.88      0.88      3534

