In [None]:
# # Experiment Notebook for NLP Chatbot

# This notebook documents the experiments conducted to improve the performance of the NLP chatbot.
# It includes data analysis, model evaluation, and visualization of results.


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, confusion_matrix
import json

In [None]:
# Read the two input files used throughout the notebook: the training set
# (CSV) and the evaluation set (JSON). Paths are relative to the notebook's
# working directory.
TRAINING_DATA_FILE = 'training_data.csv'
EVAL_DATA_FILE = 'eval_data.json'

training_data = pd.read_csv(TRAINING_DATA_FILE)
eval_data = pd.read_json(EVAL_DATA_FILE)

# Last expression of the cell: preview the leading rows of the training frame.
training_data.head()

In [None]:
# Exploratory checks on the training data: missing values per column, the
# balance of sentiment classes, and a word cloud of the message text.
from wordcloud import WordCloud

# Count of missing values in every column.
print(training_data.isnull().sum())

# Visualize the distribution of sentiments in the training data.
sns.countplot(x='sentiment', data=training_data)
plt.title('Sentiment Distribution')
plt.show()

# Build the word-cloud corpus. Missing messages are dropped and the remaining
# values coerced to str first: str.join raises TypeError on NaN (a float) or
# any other non-string entry in the column.
messages = training_data['message'].dropna().astype(str)
text = ' '.join(messages)
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud of Messages')
plt.show()

In [None]:
# Fine-tune a pre-trained BERT sequence classifier on the training messages
# and score the evaluation set. Later cells read `predictions`, which this
# cell defines as the array of per-class logits returned for the evaluation
# messages (so `predictions.argmax(axis=1)` gives the predicted class ids).
# Training is expensive; expect this cell to dominate the notebook's runtime.
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments

MODEL_NAME = 'bert-base-uncased'
# Three sentiment classes (Negative / Neutral / Positive). Without an explicit
# num_labels the classification head defaults to two outputs.
NUM_LABELS = 3
SEED = 42


class SentimentDataset(torch.utils.data.Dataset):
    """Wrap tokenizer encodings (and optional labels) as a torch Dataset for Trainer."""

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        item = {key: torch.tensor(values[idx]) for key, values in self.encodings.items()}
        # Labels are omitted for inference-only datasets.
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx])
        return item


# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)

# Tokenize the input
train_encodings = tokenizer(list(training_data['message']), truncation=True, padding=True)
eval_encodings = tokenizer(list(eval_data['message']), truncation=True, padding=True)

# NOTE(review): assumes `sentiment` holds integer class ids 0..NUM_LABELS-1,
# the same encoding as eval_data['true_labels'] -- confirm against the data.
train_labels = [int(label) for label in training_data['sentiment']]
unexpected_labels = sorted(set(train_labels) - set(range(NUM_LABELS)))
if unexpected_labels:
    raise ValueError(
        f'Sentiment labels outside 0..{NUM_LABELS - 1}: {unexpected_labels}'
    )

train_dataset = SentimentDataset(train_encodings, train_labels)
eval_dataset = SentimentDataset(eval_encodings)

# Train the model
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    seed=SEED,
)
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)
trainer.train()

# Evaluate the model on evaluation data. Trainer.predict returns a
# PredictionOutput; its `.predictions` field is the logits array.
predictions = trainer.predict(eval_dataset).predictions

# Calculate accuracy
accuracy = accuracy_score(eval_data['true_labels'], predictions.argmax(axis=1))
print(f'Accuracy: {accuracy * 100:.2f}%')


In [None]:
# Confusion matrix of actual vs. predicted sentiment on the evaluation set.
CLASS_NAMES = ['Negative', 'Neutral', 'Positive']

# `predictions` may be the raw logits array or the PredictionOutput returned
# by Trainer.predict (whose `.predictions` field holds the logits); accept both.
logits = np.asarray(getattr(predictions, 'predictions', predictions))
predicted_labels = logits.argmax(axis=1)

# Pin the class order so the matrix is always len(CLASS_NAMES) square and lines
# up with the tick labels even when a class is absent from the evaluation data.
# NOTE(review): assumes true_labels are integer ids 0..2 in CLASS_NAMES order
# (they are compared against argmax indices) -- confirm against the data.
confusion_mat = confusion_matrix(
    eval_data['true_labels'],
    predicted_labels,
    labels=list(range(len(CLASS_NAMES))),
)

plt.figure(figsize=(8, 6))
sns.heatmap(confusion_mat, annot=True, fmt='d', cmap='Blues',
            xticklabels=CLASS_NAMES,
            yticklabels=CLASS_NAMES)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Confusion Matrix')
plt.show()


In [None]:
# Side-by-side table of each evaluation message with its predicted and true
# sentiment class id.
# `predictions` may be the raw logits array or the PredictionOutput returned
# by Trainer.predict (whose `.predictions` field holds the logits); accept both.
logits = np.asarray(getattr(predictions, 'predictions', predictions))

sentiment_results = pd.DataFrame({
    'Message': eval_data['message'],
    'Predicted Sentiment': logits.argmax(axis=1),
    'True Sentiment': eval_data['true_labels']
})

# Last expression of the cell: preview the first rows of the results table.
sentiment_results.head()
