In [1]:
# Importing necessary libraries
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import pandas as pd
import re

# Download NLTK resources
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Load your dataset (replace the file path with your actual file)
df = pd.read_csv("https://github.com/suhasmaddali/Twitter-Sentiment-Analysis/raw/refs/heads/main/train.csv")

# Ensure all non-string values are handled
df['selected_text'] = df['selected_text'].fillna('').astype(str)

# Step 1: Cleaning the text
cleaned = []
for text in df['selected_text']:
    cleaned_text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation and special characters
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)  # Replace multiple spaces with a single space
    cleaned.append(cleaned_text.strip())  # Strip leading/trailing spaces

# Step 2: Tokenizing the cleaned text
tokens = [word_tokenize(x) for x in cleaned]

# Step 3: Removing stopwords
stop = set(stopwords.words('english'))
stpktn = [[word for word in sentence if word not in stop] for sentence in tokens]

# Step 4: Displaying results
print("Cleaned Text:", cleaned[:5])  # Display first 5 cleaned texts
print("Tokens:", tokens[:5])  # Display tokens for the first 5 cleaned texts
print("Tokens without Stopwords:", stpktn[:5])  # Display tokens without stopwords for the first 5 cleaned texts




[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mohammadbilalahmed/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/mohammadbilalahmed/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mohammadbilalahmed/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Cleaned Text: ['Id have responded if I were going', 'Sooo SAD', 'bullying me', 'leave me alone', 'Sons of']
Tokens: [['Id', 'have', 'responded', 'if', 'I', 'were', 'going'], ['Sooo', 'SAD'], ['bullying', 'me'], ['leave', 'me', 'alone'], ['Sons', 'of']]
Tokens without Stopwords: [['Id', 'responded', 'I', 'going'], ['Sooo', 'SAD'], ['bullying'], ['leave', 'alone'], ['Sons']]


In [3]:
from sklearn.feature_extraction.text import CountVectorizer


In [4]:
cv=CountVectorizer()

In [5]:
#importing multinomialnb
from sklearn.naive_bayes import MultinomialNB

In [6]:
print(df.columns)


Index(['textID', 'text', 'selected_text', 'sentiment'], dtype='object')


In [7]:
y=df['selected_text']

In [8]:
y

0                      I`d have responded, if I were going
1                                                 Sooo SAD
2                                              bullying me
3                                           leave me alone
4                                            Sons of ****,
                               ...                        
27476                                               d lost
27477                                        , don`t force
27478                            Yay good for both of you.
27479                           But it was worth it  ****.
27480    All this flirting going on - The ATG smiles. Y...
Name: selected_text, Length: 27481, dtype: object

In [9]:
mb= MultinomialNB()

In [10]:
from nltk.stem import PorterStemmer
ps=PorterStemmer()

In [11]:
ps.stem(df['selected_text'][0])

'i`d have responded, if i were go'

In [13]:
stemed_data=[]
for message in stpktn:
    stem=[ps.stem(word) for word in message]
    stemed_data.append(stem)

In [14]:
stemed_data

[['id', 'respond', 'i', 'go'],
 ['sooo', 'sad'],
 ['bulli'],
 ['leav', 'alon'],
 ['son'],
 ['httpwwwdothebouncycomsmf',
  'shameless',
  'plug',
  'best',
  'ranger',
  'forum',
  'earth'],
 ['fun'],
 ['soooo', 'high'],
 ['both'],
 ['wow', 'u', 'becam', 'cooler'],
 ['much',
  'love',
  'hope',
  'reckon',
  'chanc',
  'minim',
  'p',
  'im',
  'never',
  'gon',
  'na',
  'get',
  'cake',
  'stuff'],
 ['like'],
 ['danger'],
 ['lost'],
 ['test', 'test', 'lg', 'env2'],
 ['uh', 'oh', 'i', 'sunburn'],
 ['sigh'],
 ['sick'],
 ['onna'],
 ['he'],
 ['oh', 'marli', 'im', 'sorri', 'i', 'hope', 'find', 'soon', '3', '3'],
 ['interest'],
 ['clean', 'hous', 'famili', 'com', 'later', 'today'],
 ['got',
  'ta',
  'restart',
  'comput',
  'i',
  'thought',
  'win7',
  'suppos',
  'put',
  'end',
  'constant',
  'rebooti'],
 ['see',
  'wat',
  'i',
  'mean',
  'bout',
  'foll0w',
  'friiday',
  'it',
  'call',
  'lose',
  'f0llower',
  'friday',
  'smh'],
 ['free', 'fillin', 'app', 'ipod', 'fun', 'im', 'a

In [15]:
stem_vec=[' '.join(message) for message in stemed_data]

In [16]:
stem_vec

['id respond i go',
 'sooo sad',
 'bulli',
 'leav alon',
 'son',
 'httpwwwdothebouncycomsmf shameless plug best ranger forum earth',
 'fun',
 'soooo high',
 'both',
 'wow u becam cooler',
 'much love hope reckon chanc minim p im never gon na get cake stuff',
 'like',
 'danger',
 'lost',
 'test test lg env2',
 'uh oh i sunburn',
 'sigh',
 'sick',
 'onna',
 'he',
 'oh marli im sorri i hope find soon 3 3',
 'interest',
 'clean hous famili com later today',
 'got ta restart comput i thought win7 suppos put end constant rebooti',
 'see wat i mean bout foll0w friiday it call lose f0llower friday smh',
 'free fillin app ipod fun im addict',
 'im sorri',
 'internet',
 'fun',
 'power back work',
 'quiteheavenli',
 'hope',
 'well much unhappi 10 minut',
 'funni',
 'ahhh i slept game im gon na tri best watch tomorrow though i hope play armi',
 'that end tear fear',
 'miss',
 'case wonder realli busi today come ad ton new blog updat stay tune',
 'soooooo sleeeeepi',
 'a littl happi fo',
 'car happ

In [17]:

x_vec=cv.fit_transform(stem_vec).toarray()


In [18]:
x_vec

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [19]:
len(x_vec[0])

15592

In [20]:
y=df['sentiment']

In [21]:
y

0         neutral
1        negative
2        negative
3        negative
4        negative
           ...   
27476    negative
27477    negative
27478    positive
27479    positive
27480     neutral
Name: sentiment, Length: 27481, dtype: object

In [22]:
mb= MultinomialNB()

In [23]:
mb.fit(x_vec,y)

In [24]:
x_vec[0]

array([0, 0, 0, ..., 0, 0, 0])

In [25]:
df['selected_text'][0]

'I`d have responded, if I were going'

In [26]:
mb.predict([x_vec[0]])

array(['neutral'], dtype='<U8')

In [27]:
import joblib
joblib.dump(mb,'model.pkl')

['model.pkl']