<a href="https://colab.research.google.com/github/walterwhites/machine_learning/blob/main/Analyse%20Customer%20Reviews%20with%20Natural%20Language%20Processing(NLP).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Text Preprocessing

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Fetch the NLTK resources that preprocess() relies on; packages that are
# already up to date are not downloaded again.
# 'punkt_tab' is the tokenizer data word_tokenize loads on recent NLTK
# releases (3.9+); 'punkt' is kept so older releases keep working.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# Characters that mark a token as pure punctuation.
_PUNCTUATION = frozenset(string.punctuation)


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    cached = getattr(_english_stop_words, '_cache', None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stop_words._cache = cached
    return cached


def preprocess(text):
    """Normalise a raw review into a space-separated string of content words.

    Steps: tokenize with ``word_tokenize``, drop tokens made up only of
    punctuation characters, lowercase the rest, and remove English stop words.

    Args:
        text: The raw review text (a ``str``).

    Returns:
        The surviving lowercase tokens joined by single spaces; an empty
        string when nothing survives.
    """
    stop_words = _english_stop_words()
    kept = []
    for token in word_tokenize(text):
        # Test every character: a plain ``token in string.punctuation`` is a
        # substring check, so multi-character tokens such as "..." or "``"
        # would not be recognised as punctuation.
        if all(char in _PUNCTUATION for char in token):
            continue
        token = token.lower()
        if token not in stop_words:
            kept.append(token)
    return ' '.join(kept)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Text Representation

In [82]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import csv

!wget -cv https://raw.githubusercontent.com/walterwhites/machine_learning/main/customer_reviews_wide.csv

data = pd.read_csv('customer_reviews_wide.csv')
texts = data['review'].tolist()
feeling = data['feeling'].tolist()

preprocessed_texts = [preprocess(text) for text in texts]

countVectorizer = CountVectorizer()
X = countVectorizer.fit_transform(preprocessed_texts)


--2023-06-26 15:43:53--  https://raw.githubusercontent.com/walterwhites/machine_learning/main/customer_reviews_wide.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 71781 (70K) [text/plain]
Saving to: ‘customer_reviews_wide.csv’


2023-06-26 15:43:53 (8.88 MB/s) - ‘customer_reviews_wide.csv’ saved [71781/71781]



### Train the model

In [90]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Hold out 30% of the samples for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    feeling,
    random_state=40,
    test_size=0.3,
)

# Fit a logistic-regression classifier with scikit-learn's default settings.
model = LogisticRegression()
model.fit(X_train, y_train)

### Evaluate the model

In [91]:
# Predict on the held-out split and print per-class precision, recall,
# F1-score and support.
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

    negative       0.97      0.94      0.95        96
     neutral       0.92      0.99      0.96        99
    positive       0.95      0.91      0.93       102

    accuracy                           0.95       297
   macro avg       0.95      0.95      0.95       297
weighted avg       0.95      0.95      0.95       297



### Predict new feeling

In [94]:
def predict_feeling(text):
    """Classify a single raw review and return its predicted feeling label.

    The text is cleaned with ``preprocess``, turned into a count vector by
    ``countVectorizer``, and passed to ``model``; the first (only) prediction
    is returned.
    """
    features = countVectorizer.transform([preprocess(text)])
    predictions = model.predict(features)
    return predictions[0]

# Three sample reviews, one per expected class.
john_feeling = "This website is normal."
paul_feeling = "I am so happy, the product I received is exceptional."
george_feeling = "I did not like the product I received, I asked for a refund"

# Run each sample through the full preprocess -> vectorise -> predict path.
(
    predicted_feeling_john,
    predicted_feeling_paul,
    predicted_feeling_george,
) = map(predict_feeling, (john_feeling, paul_feeling, george_feeling))

# One predicted label per line, in the order john, paul, george.
print(
    predicted_feeling_john,
    predicted_feeling_paul,
    predicted_feeling_george,
    sep='\n',
)


neutral
positive
negative
