**Fetching data via the Kaggle API**

In [None]:
#Installing kaggle library
!pip install kaggle



In [None]:
#Configuring the path of the kaggle.json file
!mkdir -p ~/.kaggle
!mv /content/kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

mv: cannot stat '/content/kaggle.json': No such file or directory
chmod: cannot access '/root/.kaggle/kaggle.json': No such file or directory


**Importing dataset**

In [None]:
# Download the kazanova/sentiment140 dataset with the Kaggle command-line client
!kaggle datasets download -d kazanova/sentiment140

Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
sentiment140.zip: Skipping, found more recently modified local copy (use --force to force download)


In [None]:
#extracting csv files from zip file
from zipfile import ZipFile

kaggle_zip = '/content/sentiment140.zip'

# Bind the archive to a name other than `zip`: a top-level `as zip` would
# replace the built-in zip() for every later cell of the notebook.
# extractall() with no argument unpacks into the current working directory.
with ZipFile(kaggle_zip, 'r') as archive:
  archive.extractall()

print('Successfully extracted')

Successfully extracted


**Importing required dependencies**

In [None]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import nltk
# Fetch the 'punkt' tokenizer data; presumably what word_tokenize relies on.
# NOTE(review): some NLTK releases look for 'punkt_tab' instead - confirm
# against the installed version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# nltk is already imported by the dependencies cell; the import is repeated here.
import nltk
# Fetch the stop-word corpus that stopwords.words('english') reads in later cells.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# English stop-word list from NLTK; display the first ten entries as a sanity check.
stp = stopwords.words('english')
stp[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

**Data Loading**

In [None]:
# Column labels are supplied through names=, so the file is read as having no
# header row; the text is decoded as ISO-8859-1.
columns = ['target', 'ids', 'date', 'flag', 'user', 'text']
X_data = pd.read_csv(
    '/content/training.1600000.processed.noemoticon.csv',
    names=columns,
    encoding='ISO-8859-1',
)


In [None]:
# (rows, columns) of the loaded frame
X_data.shape

(1600000, 6)

In [None]:
# Preview the first rows of the raw data
X_data.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


**Data Understanding**

In [None]:
# Count missing values per column
X_data.isna().sum()

Unnamed: 0,0
target,0
ids,0
date,0
flag,0
user,0
text,0


In [None]:
# Column dtypes, transposed into a single wide row for display
X_data.dtypes.to_frame().T

Unnamed: 0,target,ids,date,flag,user,text
0,int64,int64,object,object,object,object


In [None]:
# Class balance: number of rows for each distinct target label
X_data['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,800000
4,800000


In [None]:
#Changing the target labels from 4 to 1 for positive sentiments
# Assign the result back to the column instead of calling
# replace(..., inplace=True) on the X_data['target'] selection: an in-place call
# on a column selection is chained assignment, which pandas warns about and
# which leaves X_data unchanged under copy-on-write.
X_data['target'] = X_data['target'].replace(4, 1)
X_data['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,800000
1,800000


**Stemming**

In [None]:
# Single PorterStemmer instance; the stemmer() function below reads this global.
stmr = PorterStemmer()

In [None]:
# Build the stop-word set once: stemmer() is applied to every row, so
# re-reading the NLTK list inside the function repeats identical work per call.
STOP_WORDS = frozenset(stopwords.words('english'))


def stemmer(content):
    """Lower-case, tokenize, filter and stem one piece of text.

    Args:
        content: The raw text (a ``str``) to normalise.

    Returns:
        A single space-separated string of Porter-stemmed tokens. Tokens that
        are not purely alphabetic (``str.isalpha``) or that appear in the
        English stop-word list are dropped; an input with no surviving
        tokens yields ``''``.
    """
    # Tokenize the lower-cased content into words
    words = word_tokenize(content.lower())

    # Keep alphabetic tokens that are not stop words
    filtered_words = [word for word in words if word.isalpha() and word not in STOP_WORDS]

    # Stem each kept word with the notebook-level PorterStemmer (`stmr`) and
    # join the stems back into a single string
    return ' '.join(stmr.stem(word) for word in filtered_words)


In [None]:
# Run stemmer() on every value of the 'text' column and store the result in a
# new 'stemmed_text' column (one Python-level call per row).
X_data['stemmed_text'] = X_data['text'].apply(stemmer)

In [None]:
# Preview the frame again, now including the 'stemmed_text' column
X_data.head()

Unnamed: 0,target,ids,date,flag,user,text,stemmed_text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot http awww bummer shoulda got david ...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset ca updat facebook text might cri result ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,kenichan dive mani time ball manag save rest g...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",nationwideclass behav mad ca see


In [None]:
#separating data and label
# x: the stemmed text strings (model input); y: the sentiment labels from 'target'
x = X_data['stemmed_text'].values
y = X_data['target'].values

In [None]:
# Show the array of stemmed strings
print(x)

['switchfoot http awww bummer shoulda got david carr third day'
 'upset ca updat facebook text might cri result school today also blah'
 'kenichan dive mani time ball manag save rest go bound' ...
 'readi mojo makeov ask detail'
 'happi birthday boo alll time tupac amaru shakur'
 'happi charitytuesday thenspcc sparkschar']


In [None]:
# Show the array of labels
print(y)

[0 0 0 ... 1 1 1]


**Splitting the data into training and test sets**

In [None]:
# Hold out 20% of the rows for testing; stratify on y so both splits keep the
# same label proportions, and fix random_state so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=23,
)

In [None]:
# Sizes of the full data, the training split and the test split
print(x.shape, x_train.shape, x_test.shape)

(1600000,) (1280000,) (320000,)


In [None]:
#converting textual data to numerical data
vectorizer = TfidfVectorizer()

# Fit the vectorizer on the training split only, then apply that same fitted
# transform to the test split.
# NOTE: x_train and x_test are rebound here from text arrays to the vectorizer's
# output, so re-run the split cell before re-running this one.
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

In [None]:
# Show the vectorized training data as (row, column) weight entries
print(x_train)

  (0, 3447)	0.4172596323202707
  (0, 150433)	0.2509115422804653
  (0, 315428)	0.873463105795769
  (1, 243225)	0.17439569570535263
  (1, 233633)	0.15900412969577687
  (1, 147429)	0.4357788214936311
  (1, 102765)	0.41887722674702377
  (1, 46711)	0.36789716527696176
  (1, 296104)	0.2800312779252936
  (1, 132264)	0.2282443277297421
  (1, 33395)	0.23101224900482442
  (1, 140290)	0.22304366852387744
  (1, 334158)	0.1683052159384444
  (1, 113342)	0.2635760867411824
  (1, 381544)	0.2114400272453593
  (1, 393093)	0.20470167360337588
  (1, 262343)	0.1598668350944018
  (2, 276222)	0.8026157154778231
  (2, 69646)	0.5964964486633781
  (3, 6412)	0.4169424292857164
  (3, 109374)	0.1863005013871226
  (3, 273785)	0.18326002360913798
  (3, 121133)	0.25896338388093465
  (3, 35067)	0.17813590871149437
  (3, 304786)	0.3379529065833255
  :	:
  (1279995, 274537)	0.3954575235902292
  (1279996, 202380)	0.48398195281186607
  (1279996, 281071)	0.3947317896718137
  (1279996, 190450)	0.397382748583509
  (1279996, 

In [None]:
# Show the vectorized test data as (row, column) weight entries
print(x_test)

  (0, 354316)	0.2185044910041192
  (0, 337566)	0.2481427159647123
  (0, 244193)	0.2591919907095813
  (0, 206784)	0.23368991364282826
  (0, 132386)	0.1635254618009174
  (0, 114649)	0.30821266522520085
  (0, 112935)	0.26720869947721004
  (0, 105843)	0.24196063376059349
  (0, 91694)	0.5739815406137783
  (0, 56033)	0.28986135648791395
  (0, 33110)	0.32207769273789155
  (1, 385615)	0.43638423290054584
  (1, 311224)	0.7714006881357869
  (1, 122062)	0.46315200487468117
  (2, 334158)	0.30114720100683223
  (2, 276008)	0.4933633372662402
  (2, 255820)	0.625187335838889
  (2, 123446)	0.3298650042173529
  (2, 68969)	0.407716635503908
  (3, 356827)	0.2483601629452412
  (3, 339238)	0.25273460825139965
  (3, 326695)	0.29340402456927794
  (3, 305649)	0.37794468710087203
  (3, 280819)	0.2616317628747514
  (3, 269037)	0.5929656813180381
  :	:
  (319994, 47159)	0.4590933854638578
  (319994, 47139)	0.4552152820317149
  (319994, 18950)	0.34640656026914113
  (319995, 366388)	0.31115463461932025
  (319995, 3

**Modeling**

In [None]:
# Logistic-regression classifier with the iteration cap set to 1000
# (presumably so the solver has room to converge on this data - confirm).
model = LogisticRegression(max_iter = 1000)

In [None]:
# Train on the vectorized training split and its labels
model.fit(x_train, y_train)

**Model Evaluation**

In [None]:
# Predict labels for the held-out test split and report the fraction that
# match y_test.
y_pred = model.predict(x_test)
acc_score = accuracy_score(y_test, y_pred)
print(acc_score)

0.77975
