In [20]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\vdm\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vdm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [21]:
# task1: data exploration
import pandas as pd
# from google.colab import drive
# from google.colab import files
# files=files.upload()

# I extended data to 100 rows using chatgpt as only 7 rows are in old text_class.csv
# df = pd.read_csv("/content/text_class_extended.csv") #in google colab
df = pd.read_csv("text_class_extended.csv")
df.head(25)

Unnamed: 0,text,label
0,It arrived late and in bad condition.,negative
1,It arrived late and in bad condition.,negative
2,"Terrible service, I will never shop here again.",negative
3,"Product was damaged when it arrived, very disa...",negative
4,The item broke within a week of use.,negative
5,"Service was standard, nothing to highlight.",neutral
6,"Excellent value for money, I'm impressed.",positive
7,Impressive performance and great service.,positive
8,"The quality is good, but the delivery was late.",neutral
9,"Absolutely wonderful experience, highly recomm...",positive


In [22]:
df.describe()

Unnamed: 0,text,label
count,100,100
unique,28,3
top,It arrived late and in bad condition.,positive
freq,9,34


In [23]:
#total rows
print(f"total rows: {len(df)}")

# count of labels
print("\n", df['label'].value_counts())

total rows: 100

 label
positive    34
negative    33
neutral     33
Name: count, dtype: int64


In [24]:
# all stopwords from library
stop_words = set(stopwords.words('english'))
print(stop_words)

def clean_text(text):

    text = text.lower()

    # removin g punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    return text

{'only', 'by', "she'd", "needn't", 'too', "we've", 'm', 'how', 'what', 'other', 'for', 'shouldn', 'just', 'no', "he'd", 'your', 'their', 'than', "we'll", 'won', 'those', 'are', "couldn't", "should've", "shouldn't", "aren't", 'themselves', 'there', 'when', 'you', 'then', "weren't", "i'd", "it'd", 'll', 'nor', "they'd", 'up', "doesn't", 'wasn', 'y', 'that', "won't", 'before', "it'll", 'why', 'after', 'in', 'and', 'same', 'more', 'ours', "mustn't", "i'll", 'himself', "shan't", "mightn't", 'wouldn', 'has', 'while', 'below', 'few', 's', 'to', "wasn't", 'we', 'me', 'theirs', 'being', "he'll", 'if', 'they', 'any', 'at', "i've", 'ain', 'its', 'about', 'she', 'do', "she'll", 'be', 'is', 'have', 'most', "you'll", 'should', "we're", "hasn't", 'here', 'myself', 'shan', "he's", 'his', 'or', "they've", 'of', 'this', 'had', 'under', "she's", 'needn', "isn't", 'o', 'mightn', 'as', 'does', 'd', 'very', 'which', "they're", 'because', 'both', 'through', 'hasn', "that'll", 'i', "you've", 'it', 'into', "i'

In [25]:
# now applying preprocessing
df['cleaned_step_i'] = df['text'].apply(clean_text)
#cleaned rows
df[['text', 'cleaned_step_i']].head()

Unnamed: 0,text,cleaned_step_i
0,It arrived late and in bad condition.,it arrived late and in bad condition
1,It arrived late and in bad condition.,it arrived late and in bad condition
2,"Terrible service, I will never shop here again.",terrible service i will never shop here again
3,"Product was damaged when it arrived, very disa...",product was damaged when it arrived very disap...
4,The item broke within a week of use.,the item broke within a week of use


In [26]:
df['cleaned_step_ii'] = [word_tokenize(text) for text in df['cleaned_step_i']]
df[['cleaned_step_i', 'cleaned_step_ii']].head()


Unnamed: 0,cleaned_step_i,cleaned_step_ii
0,it arrived late and in bad condition,"[it, arrived, late, and, in, bad, condition]"
1,it arrived late and in bad condition,"[it, arrived, late, and, in, bad, condition]"
2,terrible service i will never shop here again,"[terrible, service, i, will, never, shop, here..."
3,product was damaged when it arrived very disap...,"[product, was, damaged, when, it, arrived, ver..."
4,the item broke within a week of use,"[the, item, broke, within, a, week, of, use]"


In [27]:
df['cleaned_step_iii'] = [[word for word in text if word not in stop_words] for text in df['cleaned_step_ii']]
df[['cleaned_step_ii', 'cleaned_step_iii']].head()

Unnamed: 0,cleaned_step_ii,cleaned_step_iii
0,"[it, arrived, late, and, in, bad, condition]","[arrived, late, bad, condition]"
1,"[it, arrived, late, and, in, bad, condition]","[arrived, late, bad, condition]"
2,"[terrible, service, i, will, never, shop, here...","[terrible, service, never, shop]"
3,"[product, was, damaged, when, it, arrived, ver...","[product, damaged, arrived, disappointed]"
4,"[the, item, broke, within, a, week, of, use]","[item, broke, within, week, use]"


In [28]:
df['final_cleaned_text'] = [' '.join(text) for text in df['cleaned_step_iii']]
df[['cleaned_step_iii', 'final_cleaned_text']].head()

Unnamed: 0,cleaned_step_iii,final_cleaned_text
0,"[arrived, late, bad, condition]",arrived late bad condition
1,"[arrived, late, bad, condition]",arrived late bad condition
2,"[terrible, service, never, shop]",terrible service never shop
3,"[product, damaged, arrived, disappointed]",product damaged arrived disappointed
4,"[item, broke, within, week, use]",item broke within week use


In [29]:
# spliting the data between test and train sets
from sklearn.model_selection import train_test_split

X_input_data = df['final_cleaned_text']

y_input_label = df['label']
print(df[['final_cleaned_text','label']])

X_train, X_test, y_train, y_test = train_test_split(X_input_data, y_input_label, test_size=0.2, random_state=42)
print(f"\n\nX_train: {X_train}")
print(f"\n\nX_test: {X_test}")
print(f"\n\ny_train: {y_train}")
print(f"\n\ny_test: {y_test}")

                      final_cleaned_text     label
0             arrived late bad condition  negative
1             arrived late bad condition  negative
2            terrible service never shop  negative
3   product damaged arrived disappointed  negative
4             item broke within week use  negative
..                                   ...       ...
95  impressive performance great service  positive
96                 loved product amazing  positive
97            arrived late bad condition  negative
98            arrived late bad condition  negative
99  product damaged arrived disappointed  negative

[100 rows x 2 columns]


X_train: 55                 customer support helpful polite
88                    worst purchase ive ever made
26            impressive performance great service
42                              happy purchase buy
69            impressive performance great service
                          ...                     
60    received defective item complete waste mo

In [30]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

#vectorizing
vectorizer = CountVectorizer()

X_train_vec = vectorizer.fit_transform(X_train)
# print(f"X_train_vec: {X_train_vec}")

X_test_vec = vectorizer.transform(X_test)
# print(f"X_test_vec: {X_test_vec}")
#training model
model = LogisticRegression()

model.fit(X_train_vec, y_train)


In [31]:
from sklearn.metrics import accuracy_score

y_pred = model.predict(X_test_vec)

acc = accuracy_score(y_test, y_pred)

print("accuracy:", acc)

# increasing rows in the dataset improved accuracy to 0.95 from 0.5 previously

accuracy: 0.95


In [32]:
from sklearn.metrics import confusion_matrix, classification_report

print("confusion-matrix:")
print(confusion_matrix(y_test, y_pred))

print("\n\nreport:")
print(classification_report(y_test, y_pred))


confusion-matrix:
[[7 0 0]
 [0 6 1]
 [0 0 6]]


report:
              precision    recall  f1-score   support

    negative       1.00      1.00      1.00         7
     neutral       1.00      0.86      0.92         7
    positive       0.86      1.00      0.92         6

    accuracy                           0.95        20
   macro avg       0.95      0.95      0.95        20
weighted avg       0.96      0.95      0.95        20

