In [79]:
import pandas as pd
import re
import string, nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

### Data Preprocessing

In [80]:
#Load the dataset

df = pd.read_csv('C:/Programming/Datasets/chat_dataset.csv')

In [81]:
df.head()

Unnamed: 0,message,sentiment
0,I really enjoyed the movie,positive
1,The food was terrible,negative
2,I'm not sure how I feel about this,neutral
3,The service was excellent,positive
4,I had a bad experience,negative


In [82]:
#Replace text labels with numbers

label_mapping = {'positive':1, 'negative':0, 'neutral':-1}
df['sentiment'] = df['sentiment'].replace(label_mapping)

In [83]:
df.head(4)

Unnamed: 0,message,sentiment
0,I really enjoyed the movie,1
1,The food was terrible,0
2,I'm not sure how I feel about this,-1
3,The service was excellent,1


In [84]:
#Text cleaning

def clean_text (text):
    # A function for basic text cleaning tasks
    
    # convert text to lowercase
    text = text.lower()
    
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # Remove URLs
    text = re.sub(r'htpp\S+|www\S+','', text)
    
    # Remove punctuation
    text = text.translate(str.maketrans('','', string.punctuation))
    
    # Remove numbers
    text = re.sub(r'\d+','', text)
    
    # Tokenization
    tokens = word_tokenize(text)
    
    # Stopwords removal
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    
    # Re-join tokens into a string
    cleaned_text = ' '.join(tokens)
    
    return text

In [85]:
df['message'] = df['message'].apply(clean_text)

In [86]:
df.head()

Unnamed: 0,message,sentiment
0,i really enjoyed the movie,1
1,the food was terrible,0
2,im not sure how i feel about this,-1
3,the service was excellent,1
4,i had a bad experience,0


#### Feature Extraction

In [87]:
#split dataset

X_train, X_test, y_train, y_test = train_test_split(df['message'], df['sentiment'],test_size=0.2, random_state=42)

In [88]:
#Vectorization

vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

#### Training the ML model 1 (Naive Bayes)

In [89]:
model = MultinomialNB()
model.fit(X_train_vec, y_train)

#### Model Evaluation

In [90]:
predictions = model.predict(X_test_vec)
print(f"Accuracy: {accuracy_score(y_test, predictions)}")
print(classification_report(y_test, predictions))

Accuracy: 0.7777777777777778
              precision    recall  f1-score   support

          -1       0.70      0.96      0.81        54
           0       1.00      0.37      0.54        30
           1       0.88      0.85      0.86        33

    accuracy                           0.78       117
   macro avg       0.86      0.73      0.74       117
weighted avg       0.83      0.78      0.76       117



In [92]:
#Training SVM model

model = SVC(kernel='linear')
model.fit(X_train_vec, y_train)

In [94]:
#Evaluate the model

predictions = model.predict(X_test_vec)
print(f"Accuracy: {accuracy_score(y_test, predictions)}")
print(classification_report(y_test, predictions))

Accuracy: 0.8632478632478633
              precision    recall  f1-score   support

          -1       0.89      0.94      0.92        54
           0       0.91      0.67      0.77        30
           1       0.79      0.91      0.85        33

    accuracy                           0.86       117
   macro avg       0.86      0.84      0.84       117
weighted avg       0.87      0.86      0.86       117



In [95]:
sample_texts = ["This product is great!", "I am not happy with this service."]
sample_texts_transformed = vectorizer.transform(sample_texts)
predictions = model.predict(sample_texts_transformed)

In [96]:
for text, sentiment in zip(sample_texts, predictions):
    print(f"Text:{text}\nPredicted Sentiment: {'Positive' if sentiment == 1 else 'Negative'}\n")

Text:This product is great!
Predicted Sentiment: Positive

Text:I am not happy with this service.
Predicted Sentiment: Negative

