<a href="https://colab.research.google.com/github/teddytoken/Sentiment-Analysis-on-Twitter-Data/blob/master/TwitterSentimentAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# utilities :
import re # regular expression library
import pandas as pd

In [None]:
# `names=` is passed without `header=`, so pandas treats the first line of each
# file as data and labels the columns with `cols`.
cols = ['id', 'src', 'label', 'tweet']
df_train, df_valid = (
    pd.read_csv(csv_path, names=cols)
    for csv_path in (
        '/content/sample_data/twitter_training.csv',
        '/content/sample_data/twitter_validation.csv',
    )
)
print(df_train.head())

In [None]:
# `DataFrame.info()` writes its summary to stdout itself and returns None, so it
# is called bare; wrapping it in print() would add a stray "None" line.
df_train.info()

In [None]:
# Show the column labels that were assigned through `names=cols` at load time.
df_train.columns

In [None]:
# (rows, columns) of the training frame; when cells run top to bottom this is
# the size before rows with a missing tweet are dropped.
df_train.shape

In [None]:
# Per-column count of missing values in each split.
print('train nulls:', df_train.isna().sum())

print('validation nulls:', df_valid.isna().sum())

In [None]:
# Discard training rows whose tweet text is missing; only the `tweet` column is
# checked, and the validation frame is left untouched.
df_train= df_train.dropna(subset=['tweet'])

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of how many training tweets carry each label.
sns.countplot(
    data=df_train,
    x='label',
    color='orange',
)
plt.show()

In [None]:
from wordcloud import WordCloud
plt.figure(figsize=(5, 5))
# Configure the cloud first, then feed it every training tweet joined into one
# space-separated string (`generate` fills the same object it is called on).
wc = WordCloud(
    max_words=100,
    width=1600,
    height=800,
)
wc.generate(" ".join(df_train['tweet']))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')  # no ticks or frame around the rendered image
plt.show()

# Preprocessing

In [None]:
import spacy as spc
# Load spaCy's `en_core_web_sm` pipeline once for the whole notebook. Despite the
# variable name, `preprocessing_pipe` below uses more than tokenisation: it reads
# each token's lemma plus its stop-word and punctuation flags.
tokenizer = spc.load('en_core_web_sm')

def preprocessing_pipe(texts, n_process=4, batch_size=64):
    """Lazily clean an iterable of texts with the module-level spaCy pipeline.

    Each text is reduced to the lemmas of its tokens, skipping stop words and
    punctuation, and joined back into one space-separated string.

    Args:
        texts: Iterable of strings to clean.
        n_process: Number of worker processes handed to ``tokenizer.pipe``.
            Defaults to 4 (the previously hard-coded value); pass 1 for tiny
            inputs or on machines with fewer cores.
        batch_size: Number of texts per batch handed to ``tokenizer.pipe``.
            Defaults to 64 (the previously hard-coded value).

    Yields:
        One cleaned string per input text, in input order.

    Note:
        This is a generator: no work happens until it is iterated, and it can
        only be consumed once. Wrap the call in ``list(...)`` to keep results.
    """
    docs = tokenizer.pipe(
        texts,
        n_process=n_process,
        batch_size=batch_size,
    )
    for doc in docs:
        yield " ".join(
            token.lemma_
            for token in doc
            if not (token.is_stop or token.is_punct)
        )

# Vectorization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
def vectorization(tratxt, valtxt=None):
    """Fit a fresh TF-IDF vectorizer on `tratxt` and encode the given texts.

    Args:
        tratxt: Texts used to fit the vectorizer; they are also transformed.
        valtxt: Optional texts to encode with the already-fitted vectorizer.

    Returns:
        ``(x_train, vect)`` when `valtxt` is None, otherwise
        ``(x_train, x_val, vect)``. Callers must unpack according to which
        form they asked for.
    """
    vect = TfidfVectorizer()
    x_train = vect.fit_transform(tratxt)

    # Nothing else to encode: hand back the two-element form.
    if valtxt is None:
        return x_train, vect

    # The second set is only transformed, never fitted on.
    return x_train, vect.transform(valtxt), vect

In [None]:
# Frame sizes, training first and validation second, one per line.
print(df_train.shape, df_valid.shape, sep='\n')

# Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder
# Keep the original string labels in `label_raw` so this cell is safe to re-run:
# fitting the encoder on an already-encoded `label` column would replace the
# class names stored in `le.classes_` with integers.
if 'label_raw' not in df_train.columns:
    df_train['label_raw'] = df_train['label']
if 'label_raw' not in df_valid.columns:
    df_valid['label_raw'] = df_valid['label']

le = LabelEncoder()
df_train['label'] = le.fit_transform(df_train['label_raw'])
# `transform`, not `fit_transform`: both frames must share one label -> id
# mapping. This raises if the validation file contains a label that the
# training file does not.
df_valid['label'] = le.transform(df_valid['label_raw'])

In [None]:
from tqdm import tqdm
from sklearn.model_selection import train_test_split
# Registers tqdm's `progress_apply` on pandas objects; nothing in this cell
# calls it, it is kept for any cell that does.
tqdm.pandas()

# Clean both splits up front. `list(...)` drains the generator, and a list is
# assigned to the column positionally, so the frame's index does not matter.
df_train['cleanedtweet'] = list(preprocessing_pipe(df_train['tweet']))
df_valid['cleanedtweet'] = list(preprocessing_pipe(df_valid['tweet']))

y = df_train['label']

# Split the cleaned *text* before vectorising, so the TF-IDF vocabulary and IDF
# weights are learned from the training part only. Fitting on every row first
# would leak hold-out statistics into the features used for validation.
# The same `random_state` and row count select the same rows as before.
txt_train, txt_val, y_train, y_val = train_test_split(
    df_train['cleanedtweet'], y, test_size=0.2, random_state=42
)
x_train, x_val, vect = vectorization(txt_train, txt_val)

# Every training row encoded with the train-only vectorizer; kept so that any
# cell reading `x` keeps working.
x = vect.transform(df_train['cleanedtweet'])

# Encode the separate validation CSV with the same fitted vectorizer; it
# serves as the final test set.
x_test = vect.transform(df_valid['cleanedtweet'])
y_test = df_valid['label']

In [None]:
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Sequential

# Feed-forward classifier over the TF-IDF features.
# - The input width is declared with an explicit `Input` layer rather than the
#   `input_dim=` keyword on the first Dense layer.
# - The output width is the number of classes the LabelEncoder knows, not the
#   number that happen to appear in `y_train`. Class ids run from 0 to
#   len(le.classes_) - 1, and the softmax layer needs a unit for each of them
#   even if the random split left one out of the training part.
model = Sequential(
    [
        Input(shape=(x_train.shape[1],)),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(len(le.classes_), activation='softmax'),
    ]
)

In [None]:
# The sparse categorical cross-entropy loss takes integer class ids as targets,
# which is what the LabelEncoder wrote into the `label` column; accuracy is
# tracked alongside the loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train for 10 epochs on the training split; (x_val, y_val) is scored at the
# end of every epoch and is not used for weight updates.
model.fit(x_train, y_train, epochs=10, validation_data=(x_val, y_val))

In [None]:
# Score the trained model on the features and labels built from the validation
# CSV; with the metrics configured at compile time this yields loss and accuracy.
model.evaluate(x_test, y_test)

# Model Evaluation

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

y_preds = model.predict(x_test)
# One row of class probabilities per tweet -> index of the most likely class.
y_pred_classes = np.argmax(y_preds, axis=1)

# Report rows are named with the encoder's class names, matching the confusion
# matrix, instead of bare integer ids. `labels` pins the full id range so the
# names stay aligned even if some class is absent from this evaluation set.
print(
    classification_report(
        y_test,
        y_pred_classes,
        labels=list(range(len(le.classes_))),
        target_names=[str(name) for name in le.classes_],
    )
)

In [None]:
# Pin the label set to every class id the encoder knows. Left to infer it,
# `confusion_matrix` only includes ids that occur in `y_test` or the
# predictions, which can make the matrix smaller than `le.classes_` and
# misalign the tick labels below.
cm = confusion_matrix(
    y_test,
    y_pred_classes,
    labels=list(range(len(le.classes_))),
)
plt.figure(figsize=(6, 5))
sns.heatmap(cm,
            annot=True,
            fmt='d',  # cells are integer counts
            cmap='viridis',
            xticklabels=le.classes_,  # Use the encoder's stored class names
            yticklabels=le.classes_)  # Use the encoder's stored class names

# Rows are true labels and columns are predictions.
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

In [None]:
TestingTxt = 'My Friend Pedro is the best'
# `preprocessing_pipe` is a generator; materialise it so `pre_txt` holds the
# cleaned text and can be inspected or reused, rather than being left as an
# exhausted iterator once `transform` has consumed it.
pre_txt = list(preprocessing_pipe([TestingTxt]))
# Encode with the vectorizer fitted on the training text, then predict the
# class probabilities for this single example.
vect_txt = vect.transform(pre_txt)
pred = model.predict(vect_txt)

In [None]:
# `pred` has one row of class probabilities per input text. Taking the argmax
# along axis 1 gives one class id per row; without an axis the array is
# flattened, which is only correct for a single row. The ids are then mapped
# back to the original label names.
testingresult = le.inverse_transform(np.argmax(pred, axis=1))
testingresult