In [58]:
import requests
import zipfile
import pandas as pd
import re
from nltk import download
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [33]:
# Fetch NLTK's 'popular' data collection up front; the preprocessing cells
# below rely on NLTK data for word_tokenize, stopwords and WordNetLemmatizer.
# NOTE(review): presumably 'popular' bundles all three resources — confirm.
download('popular')

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/names.zip.
[nltk_data]    | Downloading package shakespeare to /root/nltk_data...
[nlt

True

## Dataset

In [11]:
def download_url(url, save_path='', chunk_size=1024, timeout=60):
  """Download a zip archive from `url` and extract it into `save_path`.

  The archive is streamed to 'download.zip' in the current working directory
  and then unpacked; the zip file itself is left on disk afterwards.

  Args:
    url: Address of the zip archive to fetch.
    save_path: Directory to extract into ('' extracts relative to the current
      working directory).
    chunk_size: Number of bytes requested per streamed chunk.
    timeout: Seconds to wait for the server to connect / send data before
      giving up, so a stalled connection cannot hang the notebook forever.

  Raises:
    requests.HTTPError: If the server answers with a 4xx/5xx status.
    zipfile.BadZipFile: If the downloaded payload is not a valid zip archive.
  """
  archive_name = 'download.zip'

  # The context manager releases the streamed connection even if writing fails.
  with requests.get(url, stream=True, timeout=timeout) as r:
    # Fail here on an HTTP error instead of saving the error page and
    # crashing later with a confusing BadZipFile.
    r.raise_for_status()
    with open(archive_name, 'wb') as fd:
      for chunk in r.iter_content(chunk_size=chunk_size):
        fd.write(chunk)

  with zipfile.ZipFile(archive_name, 'r') as zip_ref:
    zip_ref.extractall(save_path)

In [13]:
# Name of the CSV expected in the working directory once the archive is unpacked.
dataset_path = 'training.1600000.processed.noemoticon.csv'

# Direct-download link for the dataset archive, split by query parameter for
# readability (adjacent literals concatenate into the exact same URL).
dataset_url = (
    'https://drive.usercontent.google.com/download'
    '?id=1AlH9pTZc8WUNJNk9-0wZLoSCC9JjxDrf'
    '&export=download'
    '&authuser=0'
    '&confirm=t'
    '&uuid=b43dd9da-4b60-4137-bfed-32c8b1204d76'
    '&at=APZUnTW9uyiM-1qBgq13iDSggyNc%3A1706098258054'
)
download_url(dataset_url)

In [49]:
# First look at the CSV using pandas' default header handling.
data = pd.read_csv(filepath_or_buffer=dataset_path, encoding='ISO-8859-1')
data.head(5)

## Data Preprocessing

In [52]:
# Without explicit names the first record is consumed as the header row,
# so re-read the file and supply the column names ourselves.
# NOTE(review): the third column holds timestamps; 'data' looks like a typo
# for 'date' — confirm before renaming.
column_names = [
    'target',
    'id',
    'data',
    'flag',
    'user',
    'text',
]
data = pd.read_csv(dataset_path, encoding='ISO-8859-1', names=column_names)
data.head(5)

Unnamed: 0,target,id,data,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


Target  
0 - Negative Tweet  
4 - Positive Tweet

In [56]:
# The raw labels are 0 and 4; map 4 -> 1 so the target becomes binary 0/1.
data['target'] = data['target'].replace(4, 1)
data['target'].value_counts()

0    800000
1    800000
Name: target, dtype: int64

Target  
0 - Negative Tweet  
1 - Positive Tweet

In [53]:
# Stop-word set and lemmatizer are built once on first use and then reused,
# instead of being rebuilt for every single tweet.
_TEXT_RESOURCES = {}


def _get_text_resources():
  """Return the cached (stop_words, lemmatizer) pair, creating it on first call."""
  if not _TEXT_RESOURCES:
    _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
    _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
  return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['lemmatizer']


def preprocess_text(text):
  """Normalise a piece of text for vectorisation.

  Steps, in order: lowercase, strip every character that is neither a word
  character nor whitespace, tokenize, drop English stop words, lemmatize each
  remaining token, and join the tokens back with single spaces.

  Punctuation is removed before the stop-word filter runs, so a contraction
  such as "can't" becomes "cant" and is kept in the output.

  Args:
    text: The raw string to clean.

  Returns:
    The cleaned tokens joined by single spaces.
  """
  stop_words, lemmatizer = _get_text_resources()

  text = text.lower()

  # Remove punctuation / symbols; letters, digits, underscores and whitespace stay.
  text = re.sub(r'[^\w\s]', '', text)

  tokens = word_tokenize(text)

  tokens = [
      lemmatizer.lemmatize(word)
      for word in tokens
      if word not in stop_words
  ]

  processed_text = ' '.join(tokens)

  return processed_text

In [54]:
# Example usage: run the cleaning function on one sample sentence and print
# the result as a quick sanity check before applying it to the whole frame.
input_text = "my whole body feels itchy and like its on fire"
processed_text = preprocess_text(input_text)
print(processed_text)

whole body feel itchy like fire


In [57]:
# Clean every tweet and keep the result next to the original text.
data['lemmatized_data'] = data['text'].map(preprocess_text)
data.head(5)

Unnamed: 0,target,id,data,flag,user,text,lemmatized_data
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot httptwitpiccom2y1zl awww thats bumm...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset cant update facebook texting might cry r...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,kenichan dived many time ball managed save 50 ...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole body feel itchy like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",nationwideclass behaving im mad cant see


In [63]:
# Features are the cleaned tweets; labels are the binary sentiment targets.
X, y = data['lemmatized_data'], data['target']

# Hold out 20% for testing; stratifying on y keeps the class balance the same
# in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2,
)
X.shape, X_train.shape, X_test.shape

((1600000,), (1280000,), (320000,))

In [64]:
# Convert the textual data into numerical TF-IDF features.
vectorizer = TfidfVectorizer()

# TF-IDF assigns an importance weight to each individual word.
# The vectorizer is fitted on the training split only; the test split is
# transformed with that same fitted vectorizer.
# NOTE(review): X_train / X_test are overwritten with the transformed output
# here, so this cell cannot be re-run without re-running the split cell first.

X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

<1280000x715476 sparse matrix of type '<class 'numpy.float64'>'
	with 9643353 stored elements in Compressed Sparse Row format>

# Training the Model

## Logistic Regression

In [66]:
# Fit a logistic-regression classifier on the TF-IDF training features.
# max_iter is raised to 1000 to give the solver more iterations to converge.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Model Evaluation

## Accuracy Score

In [67]:
# Predict on the held-out features and score against the true labels.
# The score is stored under its own name so the imported accuracy_score
# function is not overwritten by a float; rebinding it made this cell fail
# with "TypeError: 'float' object is not callable" on a second run.
X_test_prediction = model.predict(X_test)
test_accuracy = accuracy_score(y_test, X_test_prediction)
print("Accuracy Score: ", test_accuracy)

Accuracy Score:  0.78820625


# Saving the Trained Model

In [68]:
import pickle

In [70]:
filename = 'trained_model.sav'

# Use a context manager so the file is flushed and closed deterministically
# instead of leaving the handle open until garbage collection.
# NOTE(review): only the classifier is saved; the fitted `vectorizer` is not,
# so raw text cannot be transformed for this model from the saved file alone.
with open(filename, 'wb') as model_file:
  pickle.dump(model, model_file)

## Using the saved model for future predictions

In [71]:
# Read the model back through a context manager so the handle is closed.
# Security: unpickling can execute arbitrary code — only load files you
# created yourself or otherwise trust.
with open(filename, 'rb') as model_file:
  loaded_model = pickle.load(model_file)

In [73]:
# X_test already holds the vectorizer's transformed output at this point, so
# row 200 can be passed straight to the reloaded model without further
# preprocessing.
X_new = X_test[200]
loaded_model.predict(X_new)


array([1])