In [2]:
import numpy as np
import pandas as pd
import re
from  nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from  sklearn.metrics import accuracy_score

In [3]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Printing stop words in english
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

# Data Pre-Processing 

In [5]:
#loading dataset into dataframe 
twitter_data = pd.read_csv('twitter_data.csv',encoding= 'ISO-8859-1')

In [6]:
#checking the number of tweets in dataframe
twitter_data.shape

(1599999, 6)

In [7]:
twitter_data.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [8]:
# naming the column
col_name = ['target','id','date','flag','user','text']
twitter_data = pd.read_csv('twitter_data.csv',names=col_name,encoding= 'ISO-8859-1')

In [9]:
twitter_data.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [10]:
twitter_data.shape

(1600000, 6)

In [11]:
#counting the null value in the dataset 
twitter_data.isnull().sum()

target    0
id        0
date      0
flag      0
user      0
text      0
dtype: int64

In [12]:
#checking the distribution of target columns
twitter_data['target'].value_counts()

target
0    800000
4    800000
Name: count, dtype: int64

In [13]:
#  converting the target data to 1 or 0
twitter_data.replace({'target':{4:1}},inplace=True)

In [14]:
twitter_data['target'].value_counts()

target
0    800000
1    800000
Name: count, dtype: int64

### 0-for neg and 1-for pos

Stemming -> a process of reducing a word to its root word
**eg: actor, actress, acting  -----> act**

In [17]:
import string
stopwords_list = set(stopwords.words('english'))

# Define negation words to be excluded from the stopwords list
negation_words = ['not', 'never', 'nor','no']

# Remove negation words from the stopwords list
custom_stopwords = stopwords_list - set(negation_words)

# Convert the set back to a list if needed
custom_stopwords_list = list(custom_stopwords)
port_stem = PorterStemmer()

def stemming (content):
     # Remove Twitter handles
    content = content.translate(str.maketrans('', '', string.punctuation)) 
    content = re.sub(r'@[\w]+', '', content)
    
    # Remove website links
    content = re.sub(r'http\S+', '', content)
      
    
    # Stemming and removing stopwords
    stemmed_content = re.sub('[^a-zA-Z]', ' ', content)  # remove non-alphabetic characters
    stemmed_content = re.sub(r'\bnot\s+(good|bad)\b', r'not_\1', content) # remove non alphabetic characters and replace them with
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in custom_stopwords_list]
    stemmed_content = ' '.join(stemmed_content)
    
    return stemmed_content

In [18]:
twitter_data['stemmed_content'] = twitter_data['text'].apply(stemming)

In [19]:
empty_string_count = (twitter_data['stemmed_content'] == '').sum()
print("Number of empty strings in 'stemmed' column:", empty_string_count)


Number of empty strings in 'stemmed' column: 367


In [20]:
nan_counts = twitter_data.isna().sum()
print(nan_counts)
twitter_data['stemmed_content'].replace({np.nan: ''}, inplace=True)

target             0
id                 0
date               0
flag               0
user               0
text               0
stemmed_content    0
dtype: int64


In [21]:
#dividing data and label

X = twitter_data['stemmed_content'].values
Y = twitter_data['target'].values

In [22]:
print(X)
print(len(X))

['switchfoot awww that bummer shoulda got david carr third day'
 'upset cant updat facebook text might cri result school today also blah'
 'kenichan dive mani time ball manag save 50 rest go bound' ...
 'readi mojo makeov ask detail'
 'happi 38th birthday boo alll time tupac amaru shakur'
 'happi charitytuesday thenspcc sparkschar speakinguph4h']
1600000


In [23]:
print(Y)
print(len(Y))

[0 0 0 ... 1 1 1]
1600000


# spliting the data for training set and test set

In [24]:

X_train , x_test, Y_train, y_test = train_test_split(X,Y,test_size=0.2, stratify=Y, random_state=2)

In [25]:
print(X.shape,X_train.shape,x_test.shape)

(1600000,) (1280000,) (320000,)


In [26]:
print(X_train)

['watch saw iv drink lil wine' 'hatermagazin im'
 'even though favourit drink think vodka coke wipe mind time think im gonna find new drink'
 ... 'eager monday afternoon'
 'hope everyon mother great day cant wait hear guy store tomorrow'
 'love wake folger bad voic deeper']


In [27]:
print(x_test)

['mmangen fine havent much time chat twitter hubbi back summer amp tend domin free time'
 'ah may show w ruth kim amp geoffrey sanhueza'
 'ishatara mayb bay area thang dammit' ...
 'destini41 nevertheless hooray 4700 member wonder safe trip'
 'not feel well' 'supersandro thank']


# converting the textual data to numerical data

In [28]:


# Now, you can use TfidfVectorizer
vec = TfidfVectorizer()
X_train = vec.fit_transform(X_train)
x_test = vec.transform(x_test)

In [30]:
print(X_train)

  (0, 591876)	0.44891492343580713
  (0, 312647)	0.42014900150965
  (0, 155489)	0.3750098954589296
  (0, 254376)	0.5289561259598358
  (0, 474228)	0.3572370305723539
  (0, 582616)	0.2718545698574146
  (1, 244969)	0.22836330961538626
  (1, 222065)	0.973575985129824
  (2, 380998)	0.16860554736549352
  (2, 184784)	0.20354985174785872
  (2, 208235)	0.18890956278192736
  (2, 545821)	0.1531104986230319
  (2, 353021)	0.24376553114436394
  (2, 592410)	0.33682686604863493
  (2, 114835)	0.31552336870730724
  (2, 578594)	0.3320585866029914
  (2, 541780)	0.32250540907491615
  (2, 181192)	0.2915730368794691
  (2, 543166)	0.18830793961977077
  (2, 173822)	0.190483603871252
  (2, 244969)	0.1180511629806638
  (2, 155489)	0.462183482268813
  (3, 238682)	0.37429026048276426
  (3, 549145)	0.27219634607773335
  (3, 519522)	0.39455048558011296
  :	:
  (1279996, 579162)	0.2683005175052005
  (1279996, 420280)	0.2099309242086821
  (1279996, 316212)	0.2215976162373984
  (1279996, 384148)	0.1791753858133457
  (12

In [31]:
print(x_test)

  (0, 563026)	0.1756699402217999
  (0, 545821)	0.3072040271759361
  (0, 532403)	0.3387504658473678
  (0, 519087)	0.21461214823624056
  (0, 369424)	0.17345788426159697
  (0, 358373)	0.4401141331587346
  (0, 237218)	0.2740454324316276
  (0, 222396)	0.21804382680141884
  (0, 192526)	0.23191594839385007
  (0, 184857)	0.24911510342827806
  (0, 151107)	0.3587121149062816
  (0, 102152)	0.2612485953668604
  (0, 55013)	0.1582670956324974
  (0, 33734)	0.17235539692279134
  (1, 489295)	0.24292921280281687
  (1, 466176)	0.4807803526282237
  (1, 340344)	0.28410353142225014
  (1, 291620)	0.40729191354880223
  (1, 201755)	0.5694262443476116
  (1, 33734)	0.21432446713542896
  (1, 22613)	0.30504574365189047
  (2, 534501)	0.43400892781709194
  (2, 340427)	0.257847808001537
  (2, 251669)	0.5864246145812769
  (2, 130185)	0.36311866719989483
  :	:
  (319995, 556785)	0.26642234705607926
  (319995, 428516)	0.4120965690275366
  (319995, 416876)	0.31944600616293284
  (319995, 309905)	0.28855362651379807
  (319

In [None]:
X_train.shape


(1280000, 616940)

In [None]:
x_test.shape

(320000, 616940)

# Taining ML model with logistic regerassion 

In [32]:
model = LogisticRegression(max_iter=10000)
#model = gaussianNB()
model.fit(X_train,Y_train)

Model evaluation

In [33]:
# accuracy score on training data
X_train_predict = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_predict)

In [34]:
print("Accuracy on Training data",training_data_accuracy*100,"%")

Accuracy on Training data 81.196796875 %


In [35]:
X_test_predict = model.predict(x_test)
training_data_accuracy = accuracy_score(y_test, X_test_predict)
print("Accuracy on Test data",training_data_accuracy*100,"%")

Accuracy on Test data 78.9740625 %


In [40]:
def prediction(sentence):
    enter = sentence
    li = stemming(enter)
    custom_test = vec.transform([li])
    test1 = model.predict(custom_test)
    if test1 == 1 :
        return 1
    else:
        return 0

# Saving the tained model

In [41]:
import pickle

In [46]:

filename = 'sentiment_model.sav'
pickle.dump(model,open(filename,'wb'))
pickle.dump(vec, open('tfidf_vectorizer.sav', 'wb'))

# How to use saved Model for future prediciton

In [43]:
loaded_model = pickle.load(open('sentiment_model.sav', 'rb'))

In [45]:
enter = "I'm not looking forward to the meeting"
li = stemming(enter)
custom_test = vec.transform([li])
test1 = model.predict(custom_test)
if test1 == 1 :
    print("✅😍: it is a positive tweet")
else:
    print("❌😒: it is a positive tweet")



❌😒: it is a positive tweet
