In [3]:
#Installing kaggle library
! pip install kaggle



**Uploading the kaggle.json file**

In [55]:
import os
import shutil

# Path to your downloaded kaggle.json
source = "kaggle.json"  # Ensure it's in the current working directory

# Create ~/.kaggle directory
kaggle_dir = os.path.join(os.path.expanduser("~"), ".kaggle")
os.makedirs(kaggle_dir, exist_ok=True)

# Destination path for kaggle.json
destination = os.path.join(kaggle_dir, "kaggle.json")

# Copy the file
shutil.copy(source, destination)

# Optional: Try setting permission (Unix-style, won't do anything on Windows)
try:
    os.chmod(destination, 0o600)
except:
    print("chmod not supported on Windows — skipping")


Importing Twitter sentiment dataaset


In [7]:
#API to fetch the dataset fro the kaggle
!kaggle datasets download -d kazanova/sentiment140


Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
sentiment140.zip: Skipping, found more recently modified local copy (use --force to force download)


In [8]:
#extracting the compressed dataset
import zipfile

with zipfile.ZipFile("sentiment140.zip", 'r') as zip_ref:
    zip_ref.extractall("sentiment140")


In [9]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [10]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sumanth\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

***Data_Processing***

In [13]:
twitter = pd.read_csv("sentiment140/training.1600000.processed.noemoticon.csv", encoding='latin-1')
twitter.head()


Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [14]:
col_names = ['target' , 'id' , 'date' , 'flag' , 'user' , 'text']
twitter.columns = col_names

In [15]:
twitter.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [16]:
twitter.shape

(1599999, 6)

In [17]:
#checking for missing value
twitter.isnull().sum()


target    0
id        0
date      0
flag      0
user      0
text      0
dtype: int64

In [18]:
# Distribution of tweets
twitter['target'].value_counts()

target
4    800000
0    799999
Name: count, dtype: int64

In [21]:
# Converting 0 to -ve and 4 to +ve
twitter['target'] = twitter['target'].map({0:0 , 4:1})

In [22]:
twitter['target'].value_counts()

target
1    800000
0    799999
Name: count, dtype: int64

0---> Negative tweet
1---> Positive tweet

In [37]:
# Stemming

stremmer = PorterStemmer()

def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]',' ',content) # removing not a-z and A-Z
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [stremmer.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

In [47]:
twitter['text'] = twitter['text'].apply(stemming)

In [48]:
twitter.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,upset updat facebook text might cri result sch...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,kenichan dive mani time ball manag save rest g...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,whole bodi feel itchi like fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,nationwideclass behav mad see
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,kwesidei whole crew


In [49]:
twitter.describe()

Unnamed: 0,target,id
count,1599999.0,1599999.0
mean,0.5,1998818000.0
std,0.5,193575700.0
min,0.0,1467811000.0
25%,0.0,1956916000.0
50%,1.0,2002102000.0
75%,1.0,2177059000.0
max,1.0,2329206000.0


In [50]:
twitter['text']

0          upset updat facebook text might cri result sch...
1          kenichan dive mani time ball manag save rest g...
2                            whole bodi feel itchi like fire
3                              nationwideclass behav mad see
4                                        kwesidei whole crew
                                 ...                        
1599994                           woke school best feel ever
1599995    thewdb com cool hear old walt interview http b...
1599996                         readi mojo makeov ask detail
1599997    happi th birthday boo alll time tupac amaru sh...
1599998    happi charitytuesday thenspcc sparkschar speak...
Name: text, Length: 1599999, dtype: object

In [51]:
twitter['target']

0          0
1          0
2          0
3          0
4          0
          ..
1599994    1
1599995    1
1599996    1
1599997    1
1599998    1
Name: target, Length: 1599999, dtype: int64

In [65]:
#Seperating the data and label
X=twitter['text'].values
Y=twitter['target'].values

In [67]:
print(X)

['upset updat facebook text might cri result school today also blah'
 'kenichan dive mani time ball manag save rest go bound'
 'whole bodi feel itchi like fire' ... 'readi mojo makeov ask detail'
 'happi th birthday boo alll time tupac amaru shakur'
 'happi charitytuesday thenspcc sparkschar speakinguph h']


In [69]:
print(Y)

[0 0 0 ... 1 1 1]


**Splitting the dataset**

In [72]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,stratify=Y,random_state=2)

In [74]:
print(X.shape,X_train.shape,X_test.shape)

(1599999,) (1279999,) (320000,)


In [76]:
print(X_train)

['watch saw iv drink lil wine'
 'hire anoth employ gourmet point current hurri'
 'punish know much work tomorrow everyon el get day' ...
 'ohjustjak awkward crush nph amazingg'
 'oliveandfig wow never tweet nah yet tuesday spoil'
 'girl jenn stay twitter b c got back late amp tri relax enough sleep work amp awak']


In [78]:
print(X_test)

['school someon slap acrosss face cau stay class total dragggg poor poor kitti'
 'ah may show w ruth kim amp geoffrey sanhueza'
 'dad comput turn even though weird mean download song want' ...
 'destini nevertheless hooray member wonder safe trip'
 'strawberri heavi appar broke blender epic fuck fail' 'supersandro thank']


In [80]:
#converting the textual data into numerical data

vectorizer = TfidfVectorizer()

X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [81]:
print(X_train)

  (0, 439408)	0.44762339349971236
  (0, 232907)	0.4196865041786133
  (0, 108277)	0.3747208594840889
  (0, 183417)	0.5297243757363407
  (0, 351633)	0.358080585111485
  (0, 433080)	0.2724907220193063
  (1, 170765)	0.38341146936709763
  (1, 85874)	0.3344071127272604
  (1, 317097)	0.31686039462323956
  (1, 150605)	0.49831089686777047
  (1, 118584)	0.4056560562783832
  (1, 19557)	0.2561195540961835
  (1, 165291)	0.4028637567608497
  (2, 92927)	0.22401910408993028
  (2, 144626)	0.22201335319858084
  (2, 115119)	0.3918827692914423
  (2, 123555)	0.3327092648175363
  (2, 409159)	0.29054638636822067
  (2, 441984)	0.23639137397901783
  (2, 276692)	0.28590989090285757
  (2, 218788)	0.2634557236875039
  (2, 323585)	0.5872022512373418
  (3, 346770)	0.30875671248934655
  (3, 45828)	0.6887701599685523
  (3, 329070)	0.4656148468373355
  :	:
  (1279996, 13729)	0.4358882391561783
  (1279996, 28817)	0.37407281062427566
  (1279996, 84393)	0.34250398106721314
  (1279997, 298348)	0.6133068151543581
  (127999

In [82]:
print(X_test)

  (0, 410629)	0.2624887713410733
  (0, 379225)	0.25412523551389005
  (0, 372528)	0.2465421614524031
  (0, 368278)	0.39729799899853624
  (0, 352927)	0.2236525280436605
  (0, 317990)	0.5238681067395509
  (0, 217211)	0.3295702657623481
  (0, 125810)	0.27311153815262845
  (0, 74553)	0.26554132821092297
  (0, 63121)	0.26403476346689203
  (1, 363243)	0.2452201959374943
  (1, 345247)	0.4737224969291345
  (1, 254505)	0.28612232656435765
  (1, 215519)	0.4026099582514783
  (1, 144035)	0.5774505753926529
  (1, 14936)	0.21044140067797668
  (1, 6369)	0.306200158434202
  (2, 434969)	0.37061314587952165
  (2, 432522)	0.22646792563778628
  (2, 415371)	0.33260415513881697
  (2, 403969)	0.27063294915166347
  (2, 372798)	0.3021012978792868
  (2, 256421)	0.30704092386413834
  (2, 123274)	0.27300780284477716
  (2, 106855)	0.3686061494114417
  :	:
  (319995, 304533)	0.38406709482393575
  (319995, 191788)	0.3731725378565657
  (319995, 176482)	0.3076638243141888
  (319995, 69475)	0.32473091751862404
  (319995

## Training the machine learning model

## Logistic regression

In [88]:
model = LogisticRegression(max_iter=1000)

In [90]:
model.fit(X_train,Y_train)

### Model Evaluation

In [92]:
#accuracy score on the training dta
X_train_pred = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train,X_train_pred)

In [93]:
print('Accuracy score on the training data:',training_data_accuracy)

Accuracy score on the training data: 0.8101264141612611


In [97]:
#accuracy score on the testing  dta
X_test_pred = model.predict(X_test)
testing_data_accuracy = accuracy_score(Y_test,X_test_pred)

In [99]:
print('Accuracy score on the testing data:',testing_data_accuracy)

Accuracy score on the testing data: 0.777746875


### Model Accuracy =77.8%

#### Saving the trained model

In [103]:
import pickle

In [105]:
filename = 'trained_model.sav'
pickle.dump(model, open(filename,'wb'))

In [107]:
# Function to predict the sentiment
def predict_sentiment(text):
    text = re.sub('[^a-zA-Z]',' ',text) # removing not a-z and A-Z
    text = text.lower()
    text = text.split() 
    text = [stremmer.stem(word) for word in text if not word in stopwords.words('english')]
    text = ' '.join(text)
    text = [text]
    text = vectorizer.transform(text)   
    sentiment = model.predict(text)
    if sentiment == 0:
        return "Negative"
    else:
        return "Positive"

In [109]:
# Testing the model
print(predict_sentiment("I hate you"))
print(predict_sentiment("I love you"))

Negative
Positive


In [111]:
#loading the saved model
loaded_model = pickle.load(open('trained_model.sav','rb'))

In [113]:
X_new=X_test[200]
print(Y_test[200])

prediction=loaded_model.predict(X_new)
print(prediction)

if(prediction[0]==0):
    print('Negative Tweet')

else:
    print('Positive  Tweet')

1
[1]
Positive  Tweet


In [115]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Train the SVM model
svm_model = LinearSVC()
svm_model.fit(X_train,Y_train)

# Predict on test data
Y_pred_svm = svm_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(Y_test, Y_pred_svm)
print(f"SVM Accuracy: {accuracy * 100:.2f}%")

print("\nClassification Report:")
print(classification_report(Y_test, Y_pred_svm))

print("\nConfusion Matrix:")
print(confusion_matrix(Y_test, Y_pred_svm))


SVM Accuracy: 76.93%

Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.76      0.77    160000
           1       0.76      0.78      0.77    160000

    accuracy                           0.77    320000
   macro avg       0.77      0.77      0.77    320000
weighted avg       0.77      0.77      0.77    320000


Confusion Matrix:
[[121237  38763]
 [ 35060 124940]]
