<a href="https://colab.research.google.com/github/Murphy-Mary/MyProject/blob/main/Twitter_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import data

Source: https://www.kaggle.com/datasets/jp797498e/twitter-entity-sentiment-analysis

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

import os
os.environ['KAGGLE_CONFIG_DIR'] = "/content/gdrive/My Drive/Kaggle"

!kaggle datasets download -d jp797498e/twitter-entity-sentiment-analysis
!unzip /content/twitter-entity-sentiment-analysis.zip && rm /content/twitter-entity-sentiment-analysis.zip

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
Downloading twitter-entity-sentiment-analysis.zip to /content
  0% 0.00/1.99M [00:00<?, ?B/s]
100% 1.99M/1.99M [00:00<00:00, 106MB/s]
Archive:  /content/twitter-entity-sentiment-analysis.zip
replace twitter_training.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
!pip install -q -U --pre pycaret
!python -m venv .env
!source /content/.env/bin
!pip install -U pip setuptools wheel
!pip install -U 'spacy[cuda-autodetect]'
!python -m spacy download en_core_web_sm

In [None]:
import numpy as np
import pandas as pd
import spacy
from tqdm.auto import tqdm
import time
from sklearn.metrics import accuracy_score

In [None]:
# Ask spaCy to use a GPU if one is available and print the value it returns.
print(spacy.prefer_gpu())

## data

In [None]:
# The file is read with header=None, so the columns come back numbered starting at 0.
df = pd.read_csv('/content/twitter_training.csv',header=None)

In [None]:
# Give the numbered columns descriptive names.
column_names = {
    0: "Id",
    1: "entity",
    2: "sentiment",
    3: "Tweet_content",
}
df = df.rename(columns=column_names)

In [None]:
# Preview the frame with the renamed columns.
df

# EDA

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Missing values per column.
df.isna().sum()

In [None]:
 # Keep only rows that have tweet text. .copy() makes df an independent frame, so the
 # column assignments in later cells do not write into a slice of the original frame
 # (the situation pandas flags with SettingWithCopyWarning).
 df = df[df['Tweet_content'].notna()].copy()

In [None]:
# Re-count missing values now that rows without tweet text have been removed.
df.isnull().sum()

In [None]:
# Re-inspect dtypes and row counts after removing rows without tweet text.
df.info()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Remove exact duplicate rows. Reassigning (rather than inplace=True) avoids mutating a
# frame that was itself produced by filtering, and keeps the cell safe to re-run.
df = df.drop_duplicates()

In [None]:
# Expected to be 0 now that duplicates have been dropped.
df.duplicated().sum()

In [None]:
# Encode the sentiment labels as numeric class ids for the classifiers.
# A single .map replaces the four separate masked assignments; any sentiment value that
# is not a key of the mapping becomes NaN, exactly as it was left unassigned before.
# When every row is mapped the column is integer-typed rather than float.
sentiment_to_id = {"Negative": 0, "Positive": 1, "Neutral": 2, "Irrelevant": 3}
df['new_sent'] = df['sentiment'].map(sentiment_to_id)

In [None]:
# A non-zero count for new_sent here means some sentiment value was not one of the four encoded labels.
df.isna().sum()

In [None]:
# Rows whose sentiment matched none of the encoded labels get class id 4.
# NOTE(review): the classification-report cell lists only four target names, so a row
# that actually receives 4 here would not have a name there — confirm none do.
df['new_sent'] = df['new_sent'].fillna(4)

In [None]:
# Inspect the frame with the numeric new_sent column added.
df

# Prepare

In [None]:
# Load the English NLP model
# prefer_gpu() is called before spacy.load so the pipeline is created after the GPU request.
spacy.prefer_gpu() # or spacy.require_gpu()
nlp = spacy.load('en_core_web_sm')

# Define stop words
# NOTE(review): stop_words is not referenced by any other visible cell; preprocess_text
# relies on token.is_stop instead.
stop_words = spacy.lang.en.stop_words.STOP_WORDS

In [None]:
def preprocess_text(text):
    """Lemmatize `text` with the notebook's spaCy pipeline and drop stop words.

    Returns the lower-cased lemmas of every token not flagged as a stop word,
    joined by single spaces.
    """
    kept = []
    for tok in nlp(text):
        # Skip tokens spaCy flags as stop words.
        if tok.is_stop:
            continue
        kept.append(tok.lemma_.lower())
    return " ".join(kept)

In [None]:
%%time

# Register progress_apply on pandas objects so the row-by-row pass shows a progress bar.
tqdm.pandas()

# Replace each tweet with its preprocessed form.
# NOTE(review): this overwrites Tweet_content, so re-running the cell preprocesses the
# already-processed text again — reload df first if a re-run is needed.
df['Tweet_content'] = df['Tweet_content'].progress_apply(preprocess_text)

In [None]:
# Inspect the frame after Tweet_content has been preprocessed.
df

# Vectorizing the Text Data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split the same across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['Tweet_content'],
    df['new_sent'],
    test_size=0.2,
    random_state=248362023,
)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: learn the vocabulary from the training tweets only,
# then encode both splits with that same vocabulary.
vectorizer = CountVectorizer()
vectorizer.fit(X_train)
X_train_vect = vectorizer.transform(X_train)
X_test_vect = vectorizer.transform(X_test)

# ML


## Naive Bayes Classifier

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Create an (unfitted) multinomial Naive Bayes classifier; fitting happens in the next cell.
# Note: the RandomForest and MLP cells below rebind the same `clf` name, so the save and
# prediction cells use whichever model cell was run last.
clf = MultinomialNB()
clf

In [None]:
# Fit the current clf on the bag-of-words training features.
clf.fit(X_train_vect, y_train)

## RandomForestClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with 10 trees. random_state is fixed so repeated runs build the same
# forest and the reported metrics are reproducible (it was previously unseeded).
clf = RandomForestClassifier(n_estimators=10, random_state=746563)
clf

In [None]:
# Fit the current clf on the bag-of-words training features.
clf.fit(X_train_vect, y_train)

## neural_network 

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
# MLP with one hidden layer of 2 units, Adam solver and a fixed seed.
clf = MLPClassifier(solver='adam', alpha=1e-5,hidden_layer_sizes=(2,), random_state=746563)
# Other hidden_layer_sizes values noted by the author (presumably ones that were tried — confirm):
#5,2
#10,2
#10,5

In [None]:
# Fit the current clf on the bag-of-words training features.
clf.fit(X_train_vect, y_train)

## Save model

In [None]:
import os
import pickle

# Persist the current `clf` to Google Drive.
# NOTE(review): the file name says "NN", but this saves whichever model `clf` currently
# refers to. The fitted `vectorizer` is not saved here and is needed to build inputs
# for the model later.
filename = "NN_Model.pickle"
model_dir = '/content/gdrive/MyDrive/sizeproject/'
# Create the target folder if it is missing so open() does not fail with FileNotFoundError.
os.makedirs(model_dir, exist_ok=True)
with open(os.path.join(model_dir, filename), 'wb') as f:
    pickle.dump(clf, f)
# To restore later (only unpickle files you trust):
#     with open(os.path.join(model_dir, filename), 'rb') as f:
#         loaded_model = pickle.load(f)

# Predictions Section

In [None]:
# Predict class ids for the held-out test features with the current clf;
# the metrics cells below compare these against y_test.
y_pred = clf.predict(X_test_vect)

In [None]:
# Peek at the predicted class ids.
y_pred

In [None]:
# Fraction of test tweets whose predicted class id equals the true one.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", acc)

# Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Counts of (true class, predicted class) pairs on the test set;
# y_test is passed as the true labels and y_pred as the predictions.
cm = confusion_matrix(y_test, y_pred)
print(cm)

In [None]:
import seaborn as sns

# fmt='d' prints the integer counts as-is; the default annotation format switches to
# scientific notation for counts in the thousands, which makes the cells hard to read.
ax = sns.heatmap(cm, annot=True, fmt='d')
# Label the axes so the figure can be read on its own (cm was built as true vs. predicted).
ax.set(xlabel='Predicted label', ylabel='True label', title='Confusion matrix');

# Classification Report

In [None]:
from sklearn.metrics import classification_report

# generate classification report
# Names are listed in class-id order: 0=Negative, 1=Positive, 2=Neutral, 3=Irrelevant.
target_names = ['Negative', 'Positive','Neutral','Irrelevant']
# Passing labels explicitly pins each name to its class id, so the report neither
# mislabels rows when a class is absent from the test split nor raises if an
# unexpected extra class id is present.
print(classification_report(y_test, y_pred, labels=[0, 1, 2, 3], target_names=target_names))

NN (neural network)
non-preprocessed text


```
precision    recall  f1-score   support

    Negative       0.82      0.87      0.85      4297
    Positive       0.84      0.82      0.83      4009
     Neutral       0.84      0.75      0.79      3592
  Irrelevant       0.68      0.73      0.70      2434

    accuracy                           0.80     14332
   macro avg       0.79      0.79      0.79     14332
weighted avg       0.81      0.80      0.80     14332
```


```
precision    recall  f1-score   support

    Negative       0.82      0.79      0.81      4297
    Positive       0.65      0.73      0.69      4009
     Neutral       0.85      0.73      0.79      3592
  Irrelevant       0.47      0.49      0.48      2434

    accuracy                           0.71     14332
   macro avg       0.70      0.69      0.69     14332
weighted avg       0.72      0.71      0.71     14332
```


```
 precision    recall  f1-score   support

    Negative       0.81      0.88      0.85      4297
    Positive       0.84      0.84      0.84      4009
     Neutral       0.87      0.79      0.83      3592
  Irrelevant       0.82      0.80      0.81      2434

    accuracy                           0.83     14332
   macro avg       0.84      0.83      0.83     14332
weighted avg       0.84      0.83      0.83     14332
```





preprocessed text


```
    precision    recall  f1-score   support

    Negative       0.30      1.00      0.46      4297
    Positive       0.00      0.00      0.00      4009
     Neutral       0.00      0.00      0.00      3592
  Irrelevant       0.00      0.00      0.00      2434

    accuracy                           0.30     14332
   macro avg       0.07      0.25      0.12     14332
weighted avg       0.09      0.30      0.14     14332
```



  
NO NLP
```
              precision    recall  f1-score   support

    Negative       0.70      0.86      0.77      4297
    Positive       0.73      0.81      0.76      4009
     Neutral       0.83      0.63      0.72      3592
       other       0.81      0.62      0.70      2434

    accuracy                           0.75     14332
    macro avg       0.77      0.73      0.74     14332
    weighted avg       0.76      0.75      0.74     14332
```


```
     precision    recall  f1-score   support

    Negative       0.92      0.95      0.93      4297
    Positive       0.91      0.93      0.92      4009
     Neutral       0.91      0.92      0.92      3592
       other       0.98      0.87      0.92      2434

    accuracy                           0.92     14332
   macro avg       0.93      0.92      0.92     14332
weighted avg       0.92      0.92      0.92     14332
```


NLP


```
 precision    recall  f1-score   support

    Negative       0.68      0.85      0.76      4297
    Positive       0.71      0.78      0.75      4009
     Neutral       0.81      0.64      0.71      3592
       other       0.81      0.61      0.70      2434

    accuracy                           0.74     14332
   macro avg       0.76      0.72      0.73     14332
weighted avg       0.75      0.74      0.73     14332
```
RDF (random forest)
n = 10

```
     precision    recall  f1-score   support

    Negative       0.88      0.91      0.89      4297
    Positive       0.82      0.92      0.86      4009
     Neutral       0.92      0.85      0.88      3592
       other       0.94      0.81      0.87      2434

    accuracy                           0.88     14332
   macro avg       0.89      0.87      0.88     14332
weighted avg       0.88      0.88      0.88     14332

```
n = 100


```
  precision    recall  f1-score   support

    Negative       0.91      0.92      0.92      4297
    Positive       0.85      0.93      0.89      4009
     Neutral       0.94      0.88      0.91      3592
       other       0.96      0.85      0.90      2434

    accuracy                           0.90     14332
   macro avg       0.91      0.90      0.90     14332
weighted avg       0.91      0.90      0.90     14332
```











# validate

In [None]:
# Load the validation file; it is read with header=None, so the columns come back numbered.
test = pd.read_csv('/content/twitter_validation.csv',header=None)

In [None]:
# Apply the same column names as the training frame.
validation_columns = {
    0: "Id",
    1: "entity",
    2: "sentiment",
    3: "Tweet_content",
}
test = test.rename(columns=validation_columns)

In [None]:
# The validation frame only has the text `sentiment` column (there is no `new_sent`
# column on it, so indexing it raised KeyError). Derive the class ids with the same
# encoding used for the training labels.
vtest = test['sentiment'].map({"Negative": 0, "Positive": 1, "Neutral": 2, "Irrelevant": 3})

In [None]:
# The classifier was fit on bag-of-words vectors of preprocessed tweets, so the
# validation text must go through the same preprocess_text and vectorizer.transform
# steps; passing the raw DataFrame to predict() does not work.
# Missing tweet text (if any) is treated as an empty string.
test_vect = vectorizer.transform(test['Tweet_content'].fillna('').apply(preprocess_text))
pred = clf.predict(test_vect)

In [None]:
# Peek at the predicted class ids for the validation tweets.
pred