In [1]:
import matplotlib
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
#from plotly import tools
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import classification_report, confusion_matrix
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from collections import Counter
from nltk.tokenize import RegexpTokenizer
import re
import string
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the tweets CSV (noted by the author as an Urdu dataset) and preview the first rows
tweets_path = 'Tweets.csv'
data = pd.read_csv(tweets_path)
data.head()

In [None]:
# Print a concise summary of the loaded frame (columns, non-null counts, dtypes)
data.info()

## Data Preprocessing

In [None]:
!pip install emoji

In [None]:
import re
import emoji

def clean_text(text):
    """Normalise a raw tweet into lowercase text without noise tokens.

    Steps, in order: strip HTML tags, URLs, hashtags and @mentions; turn
    emoji into their textual names; lowercase; drop punctuation; shorten
    elongated character runs ("sooooo" -> "soo"); drop tokens containing
    digits; collapse whitespace.

    Relies on the module-level ``re`` and ``emoji`` imports.

    Args:
        text: The raw tweet. Non-string values are converted with ``str``;
            ``None`` and float NaN are treated as missing.

    Returns:
        The cleaned string (``''`` for missing input).
    """
    # Missing values would otherwise become the literal words "none" / "nan".
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)

    # Structured tokens are removed while their delimiters are still intact.
    text = re.sub(r'<.*?>', '', text)            # HTML tags
    text = re.sub(r'http\S+', '', text)          # URLs
    text = re.sub(r'#\w+', '', text)             # hashtags
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)   # mentions

    # Emoji must be converted BEFORE punctuation is stripped: emoji are not
    # word characters, so the punctuation regex would delete them and leave
    # nothing to demojize. Space delimiters keep each emoji name a separate token.
    text = emoji.demojize(text, delimiters=(' ', ' '))

    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)          # punctuation

    # Only shorten runs of three or more identical characters, down to two.
    # Collapsing every repeat to one would corrupt ordinary words
    # ("good" -> "god", "happy" -> "hapy").
    text = re.sub(r'(.)\1{2,}', r'\1\1', text)

    text = re.sub(r'\w*\d\w*', '', text)         # tokens containing digits

    # The removals above leave gaps; squeeze them into single spaces.
    text = re.sub(r'\s+', ' ', text).strip()

    return text


In [None]:
# Clean every tweet, overwrite the text column, and preview the first ten results
data['text'] = data['text'].apply(clean_text)
data['text'].head(10)

## Data Visualization

In [None]:
import matplotlib.pyplot as plt

# Pie chart: share of tweets in each sentiment class
sentiment_counts = data['sentiment'].value_counts()

fig, ax = plt.subplots(figsize=(8, 6))
sentiment_counts.plot(
    kind='pie',
    autopct='%1.1f%%',
    colors=['skyblue', 'lightgreen', 'lightcoral'],
    ax=ax,
)
ax.set_title('Distribution of Sentiments in Tweets')
ax.set_ylabel('')  # suppress the default y-axis label
plt.show()


In [None]:
import seaborn as sns

# Bar chart: number of tweets in each sentiment class
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=data, x='sentiment', palette='pastel', ax=ax)
ax.set_title('Distribution of Sentiments in Tweets')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Count')
plt.show()


In [None]:
from wordcloud import WordCloud

# One word cloud per sentiment class, drawn side by side.
# Rows without a sentiment label are skipped: a NaN label never equals itself,
# so its subset would be empty and WordCloud would fail on empty text.
sentiments = data['sentiment'].dropna().unique()

fig, axes = plt.subplots(nrows=1, ncols=len(sentiments), figsize=(15, 5))
# plt.subplots returns a bare Axes (not an array) when there is only one column;
# normalise so the loop below works for any number of classes.
axes = np.atleast_1d(axes)

for ax, sentiment in zip(axes, sentiments):
    # Concatenate every cleaned tweet of this class into a single document
    subset = data[data['sentiment'] == sentiment]
    text = " ".join(subset['text'].tolist())

    wordcloud = WordCloud(width=400, height=200, background_color='white').generate(text)

    ax.imshow(wordcloud, interpolation='bilinear')
    # str() keeps the title working for non-string labels (e.g. integer classes)
    ax.set_title(f'Most Common Words in {str(sentiment).capitalize()} Tweets')
    ax.axis("off")

# Adjust layout and display plot
plt.tight_layout()
plt.show()


## BERT Implementation

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import AdamW


In [None]:
from sklearn.preprocessing import LabelEncoder

# Map each sentiment label to an integer class id, stored in a new column
label_encoder = LabelEncoder()
data['encoded_sentiment'] = label_encoder.fit_transform(data['sentiment'])

# Show which integer was assigned to which label
class_names = label_encoder.classes_
label_mapping = {
    name: code
    for name, code in zip(class_names, label_encoder.transform(class_names))
}
print(label_mapping)
data.head()

In [None]:
# Features are the cleaned tweet text; the target is the integer-encoded sentiment
X, y = data['text'], data['encoded_sentiment']

In [None]:
# Hold out 30% of the tweets for testing; stratifying keeps the class mix equal in both splits
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [None]:
# Load the pre-trained BERT tokenizer and a BERT model with a classification head
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)

# One output logit per class known to the label encoder (multi-class, not binary),
# so the head size can never drift out of sync with the encoded labels.
num_labels = len(label_encoder.classes_)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

In [None]:
# Tokenizer settings shared by both splits: pad, truncate at 128 tokens, return PyTorch tensors
encode_kwargs = dict(padding=True, truncation=True, return_tensors='pt', max_length=128)

# Training split: model inputs plus label tensor
train_inputs = tokenizer(list(X_train), **encode_kwargs)
train_labels = torch.tensor(list(y_train))

# Test split: model inputs plus label tensor
test_inputs = tokenizer(list(X_test), **encode_kwargs)
test_labels = torch.tensor(list(y_test))

In [None]:
# Fine-tune the BERT model with mini-batch gradient descent.
# Pushing the whole training set through BERT in a single forward pass exhausts
# memory on any realistically sized dataset and yields only one optimizer step
# per epoch, so the data is fed in shuffled mini-batches instead.
from torch.utils.data import DataLoader, TensorDataset

BATCH_SIZE = 16  # tune to the memory available on your hardware
epochs = 3

# Use a GPU when one is available; the same code runs on CPU otherwise
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# torch.optim.AdamW replaces the deprecated transformers.AdamW;
# weight_decay=0.0 mirrors the default of the transformers implementation.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.0)
loss_fn = torch.nn.CrossEntropyLoss()

torch.manual_seed(42)  # reproducible batch shuffling

# The tokenizer output is a mapping of equally sized tensors (input ids, masks, ...);
# remember the key order so each batch can be turned back into keyword arguments.
input_keys = list(train_inputs.keys())
train_loader = DataLoader(
    TensorDataset(*(train_inputs[key] for key in input_keys), train_labels),
    batch_size=BATCH_SIZE,
    shuffle=True,
)

for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for *features, labels in train_loader:
        features = [tensor.to(device) for tensor in features]
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(**dict(zip(input_keys, features)))
        loss = loss_fn(outputs.logits, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    mean_loss = running_loss / max(len(train_loader), 1)
    print(f'Epoch {epoch + 1}/{epochs} - mean training loss: {mean_loss:.4f}')

# Evaluate the model on the test set, again in batches to bound memory use
model.eval()
test_keys = list(test_inputs.keys())
test_loader = DataLoader(
    TensorDataset(*(test_inputs[key] for key in test_keys)),
    batch_size=BATCH_SIZE,
)

logit_batches = []
with torch.no_grad():
    for features in test_loader:
        features = [tensor.to(device) for tensor in features]
        outputs = model(**dict(zip(test_keys, features)))
        # Move results back to the CPU so they can be converted to NumPy below
        logit_batches.append(outputs.logits.cpu())

logits = torch.cat(logit_batches)
predictions = torch.argmax(logits, dim=1)
y_pred = predictions.numpy()

In [None]:
# Derive the report's row names from the fitted label encoder so that each name
# is guaranteed to match its integer class id (e.g. 'negative 0').
class_ids = list(range(len(label_encoder.classes_)))
target_names = [f'{label_encoder.classes_[code]} {code}' for code in class_ids]

# Generate and print the classification report; passing `labels` explicitly keeps
# the rows aligned with target_names even if a class is absent from the predictions.
report = classification_report(
    test_labels.numpy(),
    y_pred,
    labels=class_ids,
    digits=4,
    target_names=target_names,
)
print(report)