# **EDA & Data Preprocessing**

## **1.0 Import Libraries**

In [150]:
# For Data Manipulation
import pandas as pd
# For data Visualization
import seaborn as sns
import matplotlib.pyplot as plt
# Regular Expressions
import re
# Stemmer
from nltk.stem import SnowballStemmer
# Stopwords
from nltk.corpus import stopwords
# Word Tokenizer
from nltk.tokenize import word_tokenize
# Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Train Test Split
from sklearn.model_selection import train_test_split
# for saving vectorizer
import pickle

## **2.0 Load Dataset**

In [151]:
columns=['Sentiment', 'ids', 'date', 'flag', 'user', 'Tweet']
df = pd.read_csv("../Artifacts/raw_data.csv", header=None, names=columns, encoding='latin-1')

pd.set_option('display.max_colwidth', None)

## **3.0 Exploratory Data Analysis**

In [152]:
# First 5 Rows
df.head()

Unnamed: 0,Sentiment,ids,date,flag,user,Tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there."


- As we need to predict sentiments of tweets, So only Tweet and Sentiment features are enough.

In [153]:
df=df[["Tweet","Sentiment"]]
df.head()

Unnamed: 0,Tweet,Sentiment
0,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D",0
1,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!,0
2,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds,0
3,my whole body feels itchy and like its on fire,0
4,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there.",0


In [154]:
# Gives Shape of Dataset
df.shape

(1600000, 2)

- **Null Values Detection**

In [155]:
# Null Values Findings
df.isnull().sum()

Tweet        0
Sentiment    0
dtype: int64

In [156]:
# Checking if any tweet is empty
df[df["Tweet"]==''].head()

Unnamed: 0,Tweet,Sentiment


In [157]:
# Information Of Dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 2 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   Tweet      1600000 non-null  object
 1   Sentiment  1600000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 24.4+ MB


- **Duplicates Checking**

In [158]:
# Checking if any Duplicate Present
df.duplicated().any()

np.True_

In [159]:
# Count of Duplicates
df.duplicated().sum()

np.int64(16309)

In [160]:
df["Sentiment"].unique()

array([0, 4])

- We need to convert Sentiment to Binary Value, So we will convert 4 to 1(positive).

In [161]:
df.loc[df["Sentiment"]==4,"Sentiment"]=1

df["Sentiment"].unique()

array([0, 1])

In [162]:
df["Sentiment"].value_counts()

Sentiment
0    800000
1    800000
Name: count, dtype: int64

**Observations:**

- No empty Tweets or Null Values Found.
- 16309 Duplicates found, which need to be **dropped**.
- Data is completely balanced.

## **4.0 Data Preprocessing**

- **Dropping empty tweets.**

In [163]:
df.drop(df[df["Tweet"] == ""].index, inplace=True)

- **Dropping Duplicates**

In [164]:
df.drop_duplicates(inplace=True)

In [165]:
df.duplicated().any()

np.False_

- **Text Cleaner Function:**

In [166]:
df.head()

Unnamed: 0,Tweet,Sentiment
0,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D",0
1,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!,0
2,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds,0
3,my whole body feels itchy and like its on fire,0
4,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there.",0


In [167]:
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
import re

stem = SnowballStemmer("english")
stopwords = set(stopwords.words("english"))

def text_preprocessor(tweet):
    # Removes mentioned person
    tweet = re.sub(r'@\w+', '', tweet)
    # Remove URLs
    tweet = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', tweet)
    # Convert to lowercase
    tweet = tweet.lower() 
    # Remove special characters
    tweet = re.sub(r"[^a-zA-Z0-9\s]", '', tweet)
    # Split into words, remove stop words, and stem
    words = [stem.stem(word) for word in tweet.split() if word not in stopwords] 
    # Join words back into a single string (correct this part)
    tweet = ' '.join(words)
    return tweet

In [168]:
# Apply the text_preprocessor function to the "Tweet" column
df["Tweet"] = df["Tweet"].apply(text_preprocessor)

In [169]:
df.head()

Unnamed: 0,Tweet,Sentiment
0,awww that bummer shoulda got david carr third day,0
1,upset cant updat facebook text might cri result school today also blah,0
2,dive mani time ball manag save 50 rest go bound,0
3,whole bodi feel itchi like fire,0
4,behav im mad cant see,0


## **5.0 Split into X and y**

In [170]:
X=df["Tweet"]
y=df["Sentiment"]

## **6.0 Train Test Split**

In [171]:
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

## **7.0 Generating Vectorizer**

In [172]:
vectorizer=TfidfVectorizer()
vectorizer.fit(x_train)

In [173]:
x_train.shape

(1266952,)

## **8.0 Saving preprocessor and preprocessed data**

In [174]:
with open("../Artifacts/vectorizer.pkl","wb") as f:
    pickle.dump(vectorizer, f)


In [175]:
x_train.to_csv("../Artifacts/x_train.csv",index=False)
x_test.to_csv("../Artifacts/x_test.csv",index=False)
y_train.to_csv("../Artifacts/y_train.csv",index=False)
y_test.to_csv("../Artifacts/y_test.csv",index=False)