In [1]:
pip install spacy

Note: you may need to restart the kernel to use updated packages.


In [2]:
!python -m spacy download en_core_web_sm

[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [3]:
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from matplotlib import pyplot as plt

df_review = dd.read_csv(r'C:\Users\thais\Google Drive\Ciencia de dados e Big data\13.TCC\archive\20191226-reviews.csv')
df_item = dd.read_csv(r'C:\Users\thais\Google Drive\Ciencia de dados e Big data\13.TCC\archive\20191226-items.csv')

In [4]:
df=df_review.merge(df_item[['asin', 'brand']])
df=df.drop(['name','date','verified','title','helpfulVotes'], axis=1)
df=df.dropna(subset=['rating', 'body', 'brand'])

In [5]:
import re
import spacy
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = stopwords.words("english")
nlp = spacy.load("en_core_web_sm")

def lemmatize(text, nlp=nlp):  
    doc = nlp(" ".join(text))
    lemmatized = [token.lemma_ for token in doc]
    
    return lemmatized

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    return list(
        filter(lambda word: word.isalnum(), tokens)
    )
def remove_stopwords(words):
    filtered = filter(lambda word: word not in stop_words, words)
    return list(filtered)

def remove_newlinechars(text):
    return re.sub(r'\s+', ' ', text)

def remove_numbers(text):
    return re.sub(r'\d+', '', text) 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\thais\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
from dask.diagnostics import ProgressBar
with ProgressBar():
    df['cleaned_text']=df['body'].map(
        lambda text:str(text).lower()).map(
        remove_numbers).map(
        remove_newlinechars).map(
        tokenize).map(
        remove_stopwords).map(
        lemmatize).map(
        ' '.join)

In [7]:
df['opinion']=df['rating'].apply(lambda x: 'positive' if x >= 3 else 'negative', meta=float)

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
x = df['cleaned_text']
y = df['opinion']

with ProgressBar():
    X_train, X_test, y_train, y_test = train_test_split(
        x.compute(),y.compute(),test_size=0.33,random_state=42)

[########################################] | 100% Completed | 22min 23.8s
[########################################] | 100% Completed | 17min 17.8s


In [16]:
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

LinearSVC_text = Pipeline([('tfidf', TfidfVectorizer()),(
    'clf', LinearSVC())])
LinearSVC_text.fit(X_train,y_train)
LinearSVC_predictions = LinearSVC_text.predict(X_test)
LinearSVC_cm = confusion_matrix(y_test,LinearSVC_predictions)
print(classification_report(y_test,LinearSVC_predictions))

              precision    recall  f1-score   support

    negative       0.78      0.74      0.76      5397
    positive       0.92      0.93      0.93     16966

    accuracy                           0.89     22363
   macro avg       0.85      0.83      0.84     22363
weighted avg       0.88      0.89      0.89     22363



In [18]:
from sklearn.naive_bayes import MultinomialNB

MultinomialNB_text = Pipeline([('tfidf',TfidfVectorizer()),(
    'clf',MultinomialNB())])
MultinomialNB_text.fit(X_train,y_train)
MultinomialNB_predictions = MultinomialNB_text.predict(X_test)
cm = confusion_matrix(y_test,MultinomialNB_predictions)
print(classification_report(y_test,MultinomialNB_predictions))

              precision    recall  f1-score   support

    negative       0.90      0.38      0.54      5397
    positive       0.83      0.99      0.90     16966

    accuracy                           0.84     22363
   macro avg       0.86      0.68      0.72     22363
weighted avg       0.85      0.84      0.81     22363



In [20]:
from sklearn.ensemble import RandomForestClassifier

RandomForest_text = Pipeline([('tfidf', TfidfVectorizer()),(
    'clf', RandomForestClassifier())])
RandomForest_text.fit(X_train,y_train)
RandomForest_predictions = RandomForest_text.predict(X_test)
RandomForest_cm = confusion_matrix(y_test,RandomForest_predictions)
print(classification_report(y_test,RandomForest_predictions))

              precision    recall  f1-score   support

    negative       0.85      0.64      0.73      5397
    positive       0.89      0.96      0.93     16966

    accuracy                           0.89     22363
   macro avg       0.87      0.80      0.83     22363
weighted avg       0.88      0.89      0.88     22363



In [21]:
from sklearn.linear_model import LogisticRegression

LogisticRegression_text = Pipeline([('tfidf',TfidfVectorizer()),(
    'clf',LogisticRegression(max_iter=200))])
LogisticRegression_text.fit(X_train,y_train)
LogisticRegression_predictions = LogisticRegression_text.predict(X_test)
LogisticRegression_cm = confusion_matrix(y_test,LogisticRegression_predictions)
print(classification_report(y_test,LogisticRegression_predictions))

              precision    recall  f1-score   support

    negative       0.81      0.71      0.76      5397
    positive       0.91      0.95      0.93     16966

    accuracy                           0.89     22363
   macro avg       0.86      0.83      0.84     22363
weighted avg       0.89      0.89      0.89     22363



In [24]:

from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline

pipeline = Pipeline([('tfidf', TfidfVectorizer()),(
    'clf', OneVsRestClassifier(LinearSVC())),])

parameters = {
    'tfidf__max_df': (0.25, 0.5, 0.75),
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'clf__estimator__C': [0.01, 0.1, 1],
    'clf__estimator__class_weight': ['balanced', None],
}

grid_search_tune = GridSearchCV(
    pipeline, parameters, cv=2, n_jobs=2, verbose=3)
grid_search_tune.fit(X_train, y_train)

print("Best parameters set:")
print (grid_search_tune.best_estimator_.steps)

print ("Applying best classifier on test data:")
best_clf = grid_search_tune.best_estimator_
predictions = best_clf.predict(X_test)

print (classification_report(y_test, predictions))

Fitting 2 folds for each of 54 candidates, totalling 108 fits
Best parameters set:
[('tfidf', TfidfVectorizer(max_df=0.25, ngram_range=(1, 2))), ('clf', OneVsRestClassifier(estimator=LinearSVC(C=1)))]
Applying best classifier on test data:
              precision    recall  f1-score   support

    negative       0.81      0.77      0.79      5397
    positive       0.93      0.94      0.93     16966

    accuracy                           0.90     22363
   macro avg       0.87      0.86      0.86     22363
weighted avg       0.90      0.90      0.90     22363



In [26]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'))),
])
parameters = {
    'tfidf__max_df': (0.25, 0.5, 0.75),
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    "clf__estimator__C": [0.01, 0.1, 1],
    "clf__estimator__class_weight": ['balanced', None],
}

grid_search_tune = GridSearchCV(
    pipeline, parameters, cv=2, n_jobs=2, verbose=3)
grid_search_tune.fit(X_train, y_train)

print("Best parameters set:")
print (grid_search_tune.best_estimator_.steps)

print ("Applying best classifier on test data:")
best_clf = grid_search_tune.best_estimator_
predictions = best_clf.predict(X_test)

print (classification_report(y_test, predictions))

Fitting 2 folds for each of 54 candidates, totalling 108 fits
Best parameters set:
[('tfidf', TfidfVectorizer(max_df=0.25)), ('clf', OneVsRestClassifier(estimator=LogisticRegression(C=1, solver='sag')))]
Applying best classifier on test data:
              precision    recall  f1-score   support

    negative       0.81      0.71      0.76      5397
    positive       0.91      0.95      0.93     16966

    accuracy                           0.89     22363
   macro avg       0.86      0.83      0.84     22363
weighted avg       0.89      0.89      0.89     22363

