In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import re
from sklearn.metrics import accuracy_score,confusion_matrix
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

In [None]:
# Load the training split and give its four columns descriptive names.
# NOTE(review): read_csv treats the first row of the file as a header, and that
# header is replaced here. If the CSV has no header row, its first record is
# silently lost -- confirm against the file and pass header=None if so.
train_set = pd.read_csv('../../ds_csv/twitter/twitter_training.csv').set_axis(
    ['id', 'entity', 'sentiment', 'text'],
    axis=1,
)

In [None]:

# Load the validation split with the same column names as the training split.
# NOTE(review): as with any read_csv call using the default header handling,
# the first row of the file is consumed as a header and then replaced here --
# confirm the CSV really has a header row, otherwise one record is dropped.
val_set = pd.read_csv('../../ds_csv/twitter/twitter_validation.csv').set_axis(
    ['id', 'entity', 'sentiment', 'text'],
    axis=1,
)

#### Data Analysis

##### 1. Sentiment distribution

In [None]:
# Number of tweets per sentiment class, as a two-column frame.
# The columns are named explicitly ('sentiment', 'count') because the names
# produced by a bare value_counts().reset_index() differ between pandas 1.x
# and 2.x, and the plotting cell reads both columns by name.
sentiment_count = (
    train_set['sentiment']
    .value_counts()
    .rename_axis('sentiment')
    .reset_index(name='count')
)

In [None]:
# Donut chart of the overall sentiment split; each slice shows its share,
# absolute count and label.
fig = go.Figure(
    data=[
        go.Pie(
            values=sentiment_count['count'],
            labels=sentiment_count['sentiment'],
            hole=.3,
            textinfo='percent+value+label',
        )
    ]
)
fig.update_layout(title='Sentiment Distribution')
# Thin white outline to separate neighbouring slices.
fig.update_traces(marker=dict(line=dict(color='white', width=1.5)))
fig.show('svg')

##### 2. Entity distribution

In [None]:
# Ten most frequently mentioned entities with their tweet counts.
# value_counts() sorts by frequency (descending), so head(10) is the top 10.
# Columns are named explicitly ('entity', 'count') so later cells that read
# them by name do not depend on the pandas version's default naming.
top10_entity = (
    train_set['entity']
    .value_counts(sort=True)
    .rename_axis('entity')
    .reset_index(name='count')
    .head(10)
)

In [None]:
# Bar chart of the ten most-mentioned entities; bar colour and the text on
# each bar both encode the tweet count.
fig = px.bar(
    top10_entity,
    x='entity',
    y='count',
    color='count',
    text='count',
    color_continuous_scale='Blues',
    title='Top 10 Entities Distribution',
)
# Black outline keeps the palest bars visible against the background.
fig.update_traces(marker=dict(line=dict(color='black', width=1)))
fig.show('svg')

##### 3. Sentiment distribution for the top 3 entities

In [None]:
# Names of the three most-mentioned entities (first three rows of the top-10 table).
top3_entity_name = top10_entity['entity'][:3]

# All tweets that mention one of those three entities.
top3_entity = train_set[train_set['entity'].isin(top3_entity_name)]

# Tweets per (entity, sentiment) pair. The count column is named explicitly:
# with a bare reset_index() the result depends on the pandas version (on 1.x
# the counts Series is itself called 'sentiment' and collides with the index
# level of the same name).
top3_entity_count = (
    top3_entity
    .groupby('entity')['sentiment']
    .value_counts()
    .reset_index(name='count')
)

In [None]:
# Sentiment classes and the colour assigned to each one; the two lists are
# parallel (same position = same class).
sentiment_labels = [
    'Negative',
    'Neutral',
    'Positive',
    'Irrelevant',
]
sentiment_colors = [
    "#50e991",
    "#e60049",
    "#0bb4ff",
    "#e6d800",
]

In [None]:
# One row of three pie-capable ('domain') subplots, one per top entity,
# each titled with the entity name.
fig = make_subplots(
    cols=3,
    rows=1,
    specs=[[{'type': 'domain'} for _ in range(3)]],
    subplot_titles=top3_entity_name,
)

In [None]:
# Spot-check the per-sentiment counts for a single, hard-coded entity.
top3_entity_count.loc[top3_entity_count['entity'].eq('Microsoft')]

In [None]:
# Fixed label -> colour lookup so a sentiment keeps the same colour in every pie.
sentiment_color_map = dict(zip(sentiment_labels, sentiment_colors))

# One pie per top entity showing its sentiment breakdown.
for index, name in enumerate(top3_entity_name):
    entity_rows = top3_entity_count[top3_entity_count['entity'] == name]

    # The counts come from value_counts(), which orders rows by frequency
    # within each entity. Labels therefore have to be read from the rows
    # themselves: pairing the counts positionally with a fixed label list
    # attaches the wrong sentiment name to the slices.
    slice_labels = entity_rows['sentiment'].tolist()

    fig.add_trace(
        go.Pie(
            labels=slice_labels,
            values=entity_rows['count'].tolist(),
            marker_colors=[sentiment_color_map[label] for label in slice_labels],
            textinfo='percent+label',
        ),
        row=1,
        col=index + 1,
    )

fig.update_traces(marker_line_color='white', marker_line_width=1.5)
fig.update_layout(width=1000, height=400)
fig.show('svg')

#### Preprocessing Data

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF featuriser capped at 1000 terms, using scikit-learn's built-in
# English stop-word list.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=1000,
)

In [None]:
def preprocess_text(text):
    """Normalise a raw tweet into lowercase, letters-and-whitespace-only text.

    Steps, in order: lowercase, strip URLs, strip @mentions, then remove every
    character that is not an ASCII letter or whitespace. Whitespace is not
    collapsed, so removed tokens can leave consecutive spaces behind.

    Args:
        text: The tweet text. Non-string values (e.g. ``None`` or the float
            NaN that pandas uses for missing cells) are treated as empty.

    Returns:
        The cleaned string; ``''`` when ``text`` is not a string.
    """
    # Missing cells arrive as None/NaN rather than str; treat them as empty
    # instead of failing on .lower().
    if not isinstance(text, str):
        return ''

    # Converting text to lowercase
    text = text.lower()

    # Removing special characters, URLs, and mentions
    text = re.sub(r'http\S+', '', text)
    # \w covers underscores, so handles such as @user_name are removed whole
    # rather than leaving "name" behind as a spurious token.
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    return text

##### Train Dataset

In [None]:
# Discard every training row that has a missing value in any column.
train_set = train_set.dropna()

In [None]:
# Integer class id for each sentiment label; reused to encode the test labels.
sentiment_mapping = {'Negative': 0, 'Positive': 1, 'Neutral': 2, 'Irrelevant': 3}

# Keep only the model inputs (text) and target (sentiment), with the target
# encoded as integers. assign() returns an independent frame, so writing to
# its columns afterwards does not trigger pandas' SettingWithCopyWarning the
# way writing to a bare column-subset slice does.
train_set = train_set[['text', 'sentiment']].assign(
    sentiment=lambda frame: frame['sentiment'].map(sentiment_mapping)
)

In [None]:
# Clean every training tweet with preprocess_text, overwriting the raw text.
train_set['text'] = train_set['text'].map(preprocess_text)

In [None]:
# Training target: the integer-encoded sentiment column.
y_train = train_set['sentiment']
# Training features: fit the TF-IDF vocabulary on the cleaned training text
# and vectorise it in the same step.
x_train = vectorizer.fit_transform(train_set['text'])

##### Test Dataset

In [None]:
# Keep only the text and label columns, and drop rows where either is missing.
# The training set is null-filtered before use; without the same step here a
# missing tweet reaches preprocess_text as NaN, and a missing label ends up as
# NaN in y_test. copy() makes the result an independent frame so the column
# assignment in the next cell does not raise SettingWithCopyWarning.
val_set = val_set[['text', 'sentiment']].dropna().copy()

In [None]:
# Apply the same text cleaning to the validation tweets as to the training tweets.
val_set['text'] = val_set['text'].map(preprocess_text)

In [None]:
# Test target: sentiment labels encoded with the same mapping as the training set.
y_test = val_set['sentiment'].map(sentiment_mapping)
# Test features: transform only (no refit), so the vocabulary and IDF weights
# are the ones fitted on the training text.
x_test = vectorizer.transform(val_set['text'])

#### Model Training

##### 1. RandomForest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees with a fixed seed; fit() returns the estimator
# itself, so the fitted model is bound directly.
model_1 = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
).fit(x_train, y_train)
model_1

In [None]:
# Accuracy of the random forest on the held-out set.
ac_1 = accuracy_score(
    y_true=y_test,
    y_pred=model_1.predict(x_test),
)
ac_1

##### 2. DecisionTree 

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree split on information gain (entropy), with a fixed seed;
# fit() returns the estimator itself, so the fitted model is bound directly.
model_2 = DecisionTreeClassifier(
    random_state=42,
    criterion='entropy',
).fit(x_train, y_train)
model_2

In [None]:
# Accuracy of the decision tree on the held-out set.
# The sparse TF-IDF matrix is passed straight to predict(), matching how the
# tree was trained and how the random forest is evaluated; converting it to a
# dense array first only costs memory.
ac_2 = accuracy_score(y_test, model_2.predict(x_test))
ac_2

In [None]:
# Sanity check: (rows, features) of the train and test matrices.
(x_train.shape, x_test.shape)