In [None]:
import pandas as pd
import json
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
# Global seaborn theme for every figure in this notebook: the "darkgrid" style
# with the axes face colour overridden to ".9".
sns.set_style("darkgrid", {"axes.facecolor": ".9"})

In [None]:
# NOTE(review): absolute, machine-specific path; point this at a project-relative
# data directory before sharing the notebook.
amazon_json_path = "C:/Users/ryass.DESKTOP-0A1NS33/OneDrive/practice_python/scrap_data/amazon/amazon.json"

# JSON text is UTF-8 by specification; without an explicit encoding, open() falls
# back to the locale code page (e.g. cp1252 on Windows), which can mis-decode or
# fail on non-ASCII review text.
with open(amazon_json_path, encoding="utf-8") as f:
    data = json.load(f)

# One row per scraped review record.
df_coffee_reviews = pd.DataFrame(data)



In [None]:

# Integer-encode the product: one-hot encode it, then take the column position of
# the hot entry in each row.
product_dummies = pd.get_dummies(df_coffee_reviews['product'])
df_coffee_reviews['product_encode'] = product_dummies.values.argmax(axis=1)

# Print the frequency of each code followed by the frequency of each product name
# so the two can be compared.
for column_name in ('product_encode', 'product'):
    print(df_coffee_reviews[column_name].value_counts())



In [None]:


# Structural overview of the frame before any dtype conversion.
print(df_coffee_reviews.shape)
print(df_coffee_reviews.columns)
df_coffee_reviews.info()
print(df_coffee_reviews.describe(include='object'))

# Convert the rating to float and the review date to datetime64 so they can be
# aggregated and grouped by year in the cells that follow.
df_coffee_reviews['stars'] = df_coffee_reviews['stars'].astype('float64')
df_coffee_reviews['date'] = pd.to_datetime(df_coffee_reviews['date'])


In [None]:
# Bar chart of the number of reviews (non-null 'name' entries) per product.
reviews_per_product = df_coffee_reviews.groupby('product')['name'].count()
reviews_per_product.plot.bar()

In [None]:
# Mean star rating per product and review year; years on the x axis, one bar per product.
review_year = df_coffee_reviews['date'].dt.year
mean_stars = df_coffee_reviews.groupby(['product', review_year]).aggregate({'stars': 'mean'})
mean_stars.unstack().T.plot.bar()

In [None]:
# Number of reviews per product and review year; years on the x axis, one bar per product.
review_year = df_coffee_reviews['date'].dt.year
yearly_counts = df_coffee_reviews.groupby(['product', review_year])['name'].count()
yearly_counts.unstack().T.plot.bar(xlabel='Date', ylabel='Counts')

In [None]:
# Number of reviews for every (date, stars) combination, drawn as a line plot.
counts_by_date_and_stars = (
    df_coffee_reviews.groupby(by=['date', 'stars'])['name']
    .count()
    .to_frame(name='counts')
)
counts_by_date_and_stars.plot(xlabel='Date,Stars', ylabel='Counts')

# The rotation must be a number: current matplotlib accepts only a number or
# 'horizontal'/'vertical' and raises ValueError for the string '45'.
plt.xticks(rotation=45);

In [None]:
import sklearn
import re
from sklearn.feature_extraction.text import CountVectorizer
import string
import nltk
nltk.download('punkt')
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
nltk.download('stopwords')
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans, DBSCAN,AgglomerativeClustering
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.model_selection import GridSearchCV
from nltk.stem import WordNetLemmatizer,SnowballStemmer
nltk.download('wordnet')
from wordcloud import WordCloud

In [None]:

# Lower-case every review and strip all ASCII punctuation; the result is a plain
# NumPy array of strings that is fed to the vectorizer.
punctuation_table = str.maketrans('', '', string.punctuation)
review_train = (
    df_coffee_reviews['review']
    .str.lower()
    .str.translate(punctuation_table)
    .values
)


In [None]:

# Word normalisers made available to the text preprocessor defined below.
lemmatizer = WordNetLemmatizer()
porter_stemmer = PorterStemmer()
snowball_stemmer = SnowballStemmer("english", ignore_stopwords=True)

def my_text_preprocessor(text):
    """Normalise one raw review into a space-separated string of lemmas.

    Steps: lower-case the text, replace every non-letter with a space, drop a
    short list of filler words, then lemmatize each remaining alphabetic token
    with the module-level ``lemmatizer``.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces (empty string when no
        alphabetic token remains).
    """
    text = text.lower()
    text = re.sub("\\W", " ", text)  # remove special chars
    text = re.sub("[^A-Za-z]+", " ", text)  # keep ASCII letters only
    # Word boundaries (instead of mandatory surrounding whitespace) also remove a
    # filler word at the very start/end of the text and the second of two
    # adjacent filler words, which share a single separating space.
    text = re.sub("\\b(in|the|all|for|and|on|you|we|your)\\b", " ", text)

    # Only the lemmatized form is returned, so no stemmer is run here.
    words = re.split("\\s+", text)
    lemmat_words = [lemmatizer.lemmatize(word=word) for word in words if word.isalpha()]

    return ' '.join(lemmat_words)
    
# NLTK's English stop words plus extra tokens (several are truncated word forms
# such as 'wa', 'ha', 'doe') to exclude from the vocabulary.
extra_stop_words = [
    'year', 'would', 'whole', 'work', 'wonder', 'valu', 'ha', 'doe', 'wa',
    'yet', 'without', 'yes', 'your',
]
stops = stopwords.words('english') + extra_stop_words

In [None]:
# Bag-of-words model: custom preprocessing and NLTK tokenisation, stop words
# removed, terms kept only if they occur in at least 10 reviews, vocabulary
# capped at the 500 most frequent terms.
review_vec = CountVectorizer(
    preprocessor=my_text_preprocessor,
    tokenizer=nltk.word_tokenize,
    stop_words=stops,
    min_df=10,
    max_features=500,
)
review_counts = review_vec.fit_transform(review_train)

In [None]:
# Learned vocabulary as (term, column index) pairs, highest column index first.
d = review_vec.vocabulary_
sorted(d.items(), key=lambda term_and_index: term_and_index[1], reverse=True)

In [None]:
# Shape of the document-term matrix: (number of reviews, vocabulary size).
print(review_counts.shape)

# Dense view of the term counts for the first review only.
first_review_counts = review_counts[0]
first_review_counts.toarray()

In [None]:
# Re-weight the raw term counts with (smoothed) inverse document frequency.
tfidf_transformer = TfidfTransformer(use_idf=True, smooth_idf=True)
review_tfidf = tfidf_transformer.fit_transform(review_counts)

In [None]:
# Same dimensions as review_counts, now holding tf-idf weights instead of raw frequency counts

print(review_tfidf.shape)


In [None]:
# Number of leading reviews whose tf-idf weights are shown side by side.
n = 5

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# the installed version provides it and fall back on older releases.
if hasattr(review_vec, "get_feature_names_out"):
    feature_names = review_vec.get_feature_names_out()
else:
    feature_names = review_vec.get_feature_names()

# Rows are vocabulary terms, columns are the first n reviews.
# toarray() yields a plain ndarray rather than the legacy np.matrix from todense().
df_TF = pd.DataFrame(
    review_tfidf[:n].T.toarray(),
    index=feature_names,
    columns=["TF-IDF:" + str(i) for i in range(n)],
)

# Sort by the first review's weight, then the second, and so on, highest first.
df_TF = df_TF.sort_values(df_TF.columns.values.tolist(), ascending=False)
print(df_TF.shape)
print(df_TF.head(15))

In [None]:
# Dense ndarray copies of the sparse matrices. toarray() is used for both so
# that each is a real ndarray: todense() returns the legacy np.matrix type,
# which recent scikit-learn estimators reject as input.
review_tfidf_array = review_tfidf.toarray()
review_count_array = review_counts.toarray()

In [None]:
# Cluster on the dense tf-idf matrix. This must be the array object itself, not
# its variable name written as a string.
input_array = review_tfidf_array

# Fixed seeds so that the SVD projection and the cluster labels are reproducible
# across kernel restarts.
kmeans = KMeans(random_state=0)
truncatedsvd = TruncatedSVD(random_state=0)
pipeline = make_pipeline(truncatedsvd, kmeans)

# A plain dict: a trailing comma here would turn it into a 1-tuple, which
# GridSearchCV's parameter validation does not accept.
param_grid = {
    "truncatedsvd__n_components": [20, 10, 5, 4, 3, 2],
    "kmeans__n_clusters": [5, 6, 3],
}

# NOTE(review): no `scoring` is given, so candidates are ranked by the pipeline's
# own score method (KMeans.score). Confirm that this is a meaningful criterion
# for comparing different numbers of components and clusters.
pip_search = GridSearchCV(
    pipeline,
    cv=5,
    refit=True,
    error_score='raise',
    param_grid=param_grid,
    n_jobs=8,
)

# Fit the pipeline to the reviews
pip_search.fit(input_array)
print("Best parameter (CV score=%0.3f):" % pip_search.best_score_)
print(pip_search.best_params_)

# Cluster label of every review under the refitted best pipeline.
labels = pip_search.predict(input_array)

In [None]:

# Create a DataFrame aligning each cluster label with its review text. The text
# column is used here because a scipy sparse matrix is not a valid column value.
# to_numpy() drops the index, so rows are matched purely by position, the same
# order in which the reviews were vectorized.
df_classes = pd.DataFrame({
    'label': labels,
    'review': df_coffee_reviews['review'].to_numpy(),
})

# Display df_classes sorted by cluster label
print(df_classes.sort_values('label'))

Supervised classification: apply multiple models, then choose the one with the best result.

Supervised classification:
Using the star rating as the label: stars >= 3 ('good'), stars < 3 ('bad').

In [None]:
# Binary target: ratings of 3 stars or more are 'good', everything else is 'bad'.
is_good_review = df_coffee_reviews['stars'].ge(3.0)
df_coffee_reviews['Cat_review'] = np.where(is_good_review, 'good', 'bad')

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier, AdaBoostClassifier

In [None]:
# Display the repr of the document-term count matrix; it becomes the feature matrix X in the next cell.
review_counts

In [None]:
# Features are the raw term counts; the target is the good/bad review category.
X = review_counts
y = df_coffee_reviews['Cat_review']

# Hold out 20% of the reviews for testing (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=2
)

# Per-sample weights that compensate for class imbalance in the training labels.
sample_weight = compute_sample_weight(class_weight="balanced", y=y_train.values)

In [None]:
# Candidate classifiers to compare, keyed by a short label.
classifyer = {
    "decisionTreeclassifier": DecisionTreeClassifier(),
    "logisticregression": LogisticRegression(max_iter=300),
    "multinomialNB": MultinomialNB(),
    'RandomForestClassifier': RandomForestClassifier(n_estimators=300),
    'GradientBoostingClassifier': GradientBoostingClassifier(n_estimators=300),
}

# Fit each candidate with the balancing sample weights, then report its accuracy
# and confusion matrix on the held-out test set.
for model in classifyer.values():
    clf = model.fit(X_train, y_train, sample_weight=sample_weight)
    y_pred = clf.predict(X_test)
    test_accuracy = sklearn.metrics.accuracy_score(y_test, y_pred)
    print('results of {}={}'.format(clf, test_accuracy))
    print(confusion_matrix(y_test, y_pred))


In [None]:
# AdaBoost over depth-5 decision trees, seeded for reproducibility.
seed = 1
weak_learner = DecisionTreeClassifier(max_depth=5)
adaboostclassifier = AdaBoostClassifier(weak_learner, n_estimators=300, random_state=seed)


In [None]:
# 5-fold cross-validated score of the AdaBoost model on the full data set,
# using all available cores; the cell displays the mean over the folds.
scores = cross_val_score(adaboostclassifier, X, y, cv=5, n_jobs=-1)
np.mean(scores)


In [None]:
# Fit AdaBoost on the training split with the balancing sample weights, then
# report accuracy and the confusion matrix on the held-out test split.
clf = adaboostclassifier.fit(X_train, y_train, sample_weight=sample_weight)
y_pred = clf.predict(X_test)
test_accuracy = sklearn.metrics.accuracy_score(y_test, y_pred)
print('results of {}={}'.format(clf, test_accuracy))
print(confusion_matrix(y_test, y_pred))


In [None]:
# Per-class precision, recall and F1 for the most recent test-set predictions.
print(classification_report(y_test, y_pred))

Save the DataFrame as a CSV file for future use.

In [None]:
# Persist the frame, including the columns added in this notebook, to the working directory.
df_coffee_reviews.to_csv(path_or_buf='coffee_review.csv')