# **Twitter Sentiment Analysis**

**1) Importing Packages**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import re

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import warnings
warnings.filterwarnings("ignore")

**2) Importing Dataset**

In [None]:
# Read the tweet dataset from the Kaggle input directory into a DataFrame
DATA_PATH = '/kaggle/input/twitter-sentiment-dataset/Twitter_Data.csv'
df = pd.read_csv(DATA_PATH)

**3) Analysing the Data**

In [None]:
# Peek at five randomly chosen rows of the dataset
df.sample(n=5)

In [None]:
# Dimensions of the dataset as (number of rows, number of columns)
df.shape

In [None]:
# Detailed information about the data (columns, non-null counts, dtypes)
df.info()

In [None]:
# Count the missing values in each column
df.isna().sum()

In [None]:
# Drop every row that contains at least one missing value
df = df.dropna(axis=0, how='any')

In [None]:
# Number of tweets in each sentiment category
df['category'].value_counts()

In [None]:
# Share of each sentiment category, drawn as a pie chart
category_counts = df['category'].value_counts()

fig, ax = plt.subplots(figsize=[4, 4])
ax.pie(category_counts, labels=category_counts.index, autopct='%1.1f%%')
ax.set_title('Percentage share: Positive Tweet vs Negative Tweet vs Neutral Tweet')
plt.show()

**4) Data Preprocessing**

In [None]:
import nltk
# Fetch the NLTK stop-word corpus (quietly) so stopwords.words('english') can be loaded
nltk.download('stopwords', quiet=True)

In [None]:
# Matches every character that is not an ASCII letter
pattern = re.compile('[^a-zA-Z]')

# Stored as a set: the stop-word test runs once per token, and set membership
# is a constant-time hash lookup instead of a linear scan of a list
english_stopwords = set(stopwords.words('english'))

# Single Porter stemmer instance reused for every token
port_stemmer = PorterStemmer()

In [None]:
def preprocessed_text(text):
    """Normalise one tweet into a string of space-separated stemmed tokens.

    Steps: replace every non-letter character with a space, lower-case the
    result, split on whitespace, discard English stop words and Porter-stem
    the remaining tokens.

    Relies on the module-level `pattern`, `english_stopwords` and
    `port_stemmer` objects.

    Args:
        text: The raw tweet text (a string).

    Returns:
        The surviving stemmed tokens joined by single spaces.
    """
    letters_only = pattern.sub(' ', text).lower()

    kept_stems = []
    for token in letters_only.split():
        if token in english_stopwords:
            continue
        kept_stems.append(port_stemmer.stem(token))

    return ' '.join(kept_stems)

In [None]:
# Run the preprocessing function on every tweet and keep the result in a new column
df['stemmed_content'] = df['clean_text'].map(preprocessed_text)

In [None]:
# First five rows, now including the preprocessed text column
df.head(5)

**5) Data Visualization**

In [None]:
# Segregate the tweets by sentiment label: -1 negative, 1 positive, 0 neutral
sentiment = df["category"]
df_negative = df[sentiment == -1]
df_positive = df[sentiment == 1]
df_neutral = df[sentiment == 0]

In [None]:
from wordcloud import WordCloud,STOPWORDS

In [None]:
# Most frequent words across all tweets, before vs. after preprocessing
all_words_raw = " ".join(df['clean_text'])
all_words_processed = " ".join(df['stemmed_content'])

# Identical rendering settings for both clouds so they are directly comparable
cloud_settings = dict(width=800, height=400, random_state=21, max_font_size=110)
wordcloud_raw = WordCloud(**cloud_settings).generate(all_words_raw)
wordcloud_processed = WordCloud(**cloud_settings).generate(all_words_processed)

# Side-by-side panels: raw text on the left, processed text on the right
fig, axes = plt.subplots(1, 2, figsize=(20, 10))
panels = [
    (wordcloud_raw, 'Word Cloud Before Preprocessing'),
    (wordcloud_processed, 'Word Cloud After Preprocessing'),
]
for axis, (cloud, heading) in zip(axes, panels):
    axis.imshow(cloud, interpolation='bilinear')
    axis.set_title(heading, fontsize=20)
    axis.axis('off')

plt.show()

In [None]:
# Most frequent words in positive tweets, before vs. after preprocessing
all_words_raw_positive = " ".join(df_positive['clean_text'])
all_words_processed_positive = " ".join(df_positive['stemmed_content'])

# Identical rendering settings (white background) for both clouds
cloud_settings = dict(width=800, height=400, random_state=21, max_font_size=110, background_color='white')
wordcloud_raw_positive = WordCloud(**cloud_settings).generate(all_words_raw_positive)
wordcloud_processed_positive = WordCloud(**cloud_settings).generate(all_words_processed_positive)

# Side-by-side panels: raw text on the left, processed text on the right
fig, axes = plt.subplots(1, 2, figsize=(20, 10))
panels = [
    (wordcloud_raw_positive, 'Word Cloud for Positive Tweets Before Preprocessing'),
    (wordcloud_processed_positive, 'Word Cloud for Positive Tweets After Preprocessing'),
]
for axis, (cloud, heading) in zip(axes, panels):
    axis.imshow(cloud, interpolation='bilinear')
    axis.set_title(heading, fontsize=20)
    axis.axis('off')

plt.show()

In [None]:
# Most frequent words in negative tweets, before vs. after preprocessing
all_words_raw_negative = " ".join(df_negative['clean_text'])
all_words_processed_negative = " ".join(df_negative['stemmed_content'])

# Identical rendering settings for both clouds so they are directly comparable
cloud_settings = dict(width=800, height=400, random_state=21, max_font_size=110)
wordcloud_raw_negative = WordCloud(**cloud_settings).generate(all_words_raw_negative)
wordcloud_processed_negative = WordCloud(**cloud_settings).generate(all_words_processed_negative)

# Side-by-side panels: raw text on the left, processed text on the right
fig, axes = plt.subplots(1, 2, figsize=(20, 10))
panels = [
    (wordcloud_raw_negative, 'Word Cloud for Negative Tweets Before Preprocessing'),
    (wordcloud_processed_negative, 'Word Cloud for Negative Tweets After Preprocessing'),
]
for axis, (cloud, heading) in zip(axes, panels):
    axis.imshow(cloud, interpolation='bilinear')
    axis.set_title(heading, fontsize=20)
    axis.axis('off')

plt.show()

In [None]:
# Most frequent words in neutral tweets, before vs. after preprocessing
all_words_raw_neutral = " ".join(df_neutral['clean_text'])
all_words_processed_neutral = " ".join(df_neutral['stemmed_content'])

# Identical rendering settings (white background) for both clouds
cloud_settings = dict(width=800, height=400, random_state=21, max_font_size=110, background_color='white')
wordcloud_raw_neutral = WordCloud(**cloud_settings).generate(all_words_raw_neutral)
wordcloud_processed_neutral = WordCloud(**cloud_settings).generate(all_words_processed_neutral)

# Side-by-side panels: raw text on the left, processed text on the right
fig, axes = plt.subplots(1, 2, figsize=(20, 10))
panels = [
    (wordcloud_raw_neutral, 'Word Cloud for Neutral Tweets Before Preprocessing'),
    (wordcloud_processed_neutral, 'Word Cloud for Neutral Tweets After Preprocessing'),
]
for axis, (cloud, heading) in zip(axes, panels):
    axis.imshow(cloud, interpolation='bilinear')
    axis.set_title(heading, fontsize=20)
    axis.axis('off')

plt.show()

**6) Separating Input Features and Target Label**

In [None]:
# Model input is the preprocessed text; the target is the sentiment category
X, y = df['stemmed_content'], df['category']

In [None]:
# Hold out 20% of the tweets for testing; stratifying on y keeps the category
# proportions the same in both splits, and the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2,
)

**7) Feature Extraction**

In [None]:
# Convert the text into numerical TF-IDF features
vectorizer = TfidfVectorizer()

# Fit the vectorizer on the training split only, then apply the already-fitted
# vectorizer to the test split so the test data is never used for fitting
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

**8) Model Training**

In [None]:
# Train a logistic regression classifier on the TF-IDF training features
# (iteration cap raised to 1000)
lr = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Predict a sentiment category for every tweet in the test split
y_pred = lr.predict(X_test_tfidf)

**9) Model Evaluation Metrics**

In [None]:
# Accuracy of the predictions against the true test labels
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')  # shown rounded to two decimals

In [None]:
# Per-category report comparing true vs. predicted labels on the test split
print('Classification Report:')
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix of true (y_test) vs. predicted (y_pred) categories
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))

**10) Saving the model**

In [None]:
import pickle

# Persist the trained classifier to disk
filename = 'tweet_lr_model.pkl'

# The context manager guarantees the file handle is flushed and closed,
# even if pickling raises part-way through
with open(filename, 'wb') as model_file:
    pickle.dump(lr, model_file)

# NOTE(review): only the classifier is saved. The fitted `vectorizer` is also
# required to turn new text into features before calling `lr.predict`;
# consider persisting it alongside the model.