In [1]:
import findspark
findspark.init()

In [2]:
# import libraries
from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from datetime import datetime
from pyspark.sql.functions import mean, stddev, col, log
from pyspark.sql.functions import to_date, dayofweek, to_timestamp
from pyspark.sql import types 
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DateType
from pyspark.sql.functions import year, month
from pyspark.sql.functions import dayofmonth, weekofyear
from pyspark.sql.functions import split, explode
from pyspark.sql.functions import coalesce, first, lit
from pyspark.ml.feature import Binarizer
from pyspark.ml.feature import Bucketizer
from pyspark.ml.feature import OneHotEncoder, StringIndexer, OneHotEncoderEstimator
from pyspark.sql.functions import regexp_extract, col
from pyspark.sql.functions import datediff
from pyspark.sql.functions import when

from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import LinearRegressionModel
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import GBTClassifier, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.functions import *
from pyspark.sql.types import *
import pandas as pd

In [3]:
spark = SparkSession.builder.appName('review').getOrCreate()

In [4]:
xls = pd.ExcelFile('Womens_Clothing_E_Commerce_Reviews.xlsx')
data = pd.read_excel(xls, 'Reviews',  index_col=0)

In [5]:
data

Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses
...,...,...,...,...,...,...,...,...,...,...
23476,1104,34,Great dress for many occasions,I was very happy to snag this dress at such a ...,5,1,0,General Petite,Dresses,Dresses
23477,862,48,Wish it was made of cotton,"It reminds me of maternity clothes. soft, stre...",3,1,0,General Petite,Tops,Knits
23478,1104,31,"Cute, but see through","This fit well, but the top was very see throug...",3,0,1,General Petite,Dresses,Dresses
23479,1084,28,"Very cute dress, perfect for summer parties an...",I bought this dress for a wedding i have this ...,3,1,2,General,Dresses,Dresses


In [6]:
from pyspark.sql.types import *
mySchema = StructType([ StructField("Clothing ID", IntegerType(), True)\

                       ,StructField("Age", IntegerType(), True)\

                       ,StructField("Title", StringType(), True)\

                       ,StructField("Review Text", StringType(), True)\

                       ,StructField("Rating", IntegerType(), True)\

                       ,StructField("Recommended IND", IntegerType(), True)\

                       ,StructField("Positive Feedback Count", IntegerType(), True)\
                       
                       ,StructField("Division Name", StringType(), True)\

                       ,StructField("Department Name", StringType(), True)\

                       ,StructField("Class Name", StringType(), True)
])

In [7]:
df = spark.createDataFrame(data,schema=mySchema)

In [8]:
df.show(5)

+-----------+---+--------------------+--------------------+------+---------------+-----------------------+--------------+---------------+----------+
|Clothing ID|Age|               Title|         Review Text|Rating|Recommended IND|Positive Feedback Count| Division Name|Department Name|Class Name|
+-----------+---+--------------------+--------------------+------+---------------+-----------------------+--------------+---------------+----------+
|        767| 33|                 NaN|Absolutely wonder...|     4|              1|                      0|     Initmates|       Intimate| Intimates|
|       1080| 34|                 NaN|Love this dress! ...|     5|              1|                      4|       General|        Dresses|   Dresses|
|       1077| 60|Some major design...|I had such high h...|     3|              0|                      0|       General|        Dresses|   Dresses|
|       1049| 50|    My favorite buy!|I love, love, lov...|     5|              1|                      0|

In [9]:
df.select([count(when(isnan(c), c)).alias(c) for c in df.columns]).toPandas().T

Unnamed: 0,0
Clothing ID,0
Age,0
Title,3810
Review Text,845
Rating,0
Recommended IND,0
Positive Feedback Count,0
Division Name,14
Department Name,14
Class Name,14


In [10]:
df.count()

23481

#### Remove Title feature

In [11]:
df = df.drop('Title')
df.select([count(when(isnan(c), c)).alias(c) for c in df.columns]).toPandas().T

Unnamed: 0,0
Clothing ID,0
Age,0
Review Text,845
Rating,0
Recommended IND,0
Positive Feedback Count,0
Division Name,14
Department Name,14
Class Name,14


In [12]:
df.crosstab('Rating', 'Division Name').show()

+--------------------+-------+--------------+---------+---+
|Rating_Division Name|General|General Petite|Initmates|NaN|
+--------------------+-------+--------------+---------+---+
|                   5|   7611|          4604|      901| 14|
|                   1|    490|           297|       54|  0|
|                   2|    959|           528|       77|  0|
|                   3|   1772|           943|      155|  0|
|                   4|   3013|          1748|      315|  0|
+--------------------+-------+--------------+---------+---+



In [13]:
df.crosstab('Rating', 'Department Name').show()

+----------------------+-------+-------+--------+-------+---+----+-----+
|Rating_Department Name|Bottoms|Dresses|Intimate|Jackets|NaN|Tops|Trend|
+----------------------+-------+-------+--------+-------+---+----+-----+
|                     5|   2273|   3397|    1033|    631| 14|5730|   52|
|                     1|    115|    227|      63|     49|  0| 376|   11|
|                     2|    205|    461|      90|     60|  0| 737|   11|
|                     3|    421|    837|     178|     91|  0|1324|   19|
|                     4|    785|   1395|     371|    201|  0|2298|   26|
+----------------------+-------+-------+--------+-------+---+----+-----+



In [14]:
df.crosstab('Class Name', 'Rating').show()

+-----------------+---+---+---+----+----+
|Class Name_Rating|  1|  2|  3|   4|   5|
+-----------------+---+---+---+----+----+
|         Chemises|  0|  0|  0|   1|   0|
|            Pants| 42| 82|157| 291| 816|
|            Trend| 11| 11| 19|  26|  52|
|              NaN|  0|  0|  0|   0|  14|
|           Lounge| 23| 25| 78| 160| 405|
|       Fine gauge| 30| 75|119| 230| 646|
|        Outerwear| 14| 22| 29|  83| 180|
|   Casual bottoms|  0|  0|  0|   1|   1|
|         Layering|  3|  8|  9|  37|  89|
|         Sweaters| 49|105|195| 268| 810|
|            Jeans| 30| 55|115| 218| 729|
|            Sleep| 10| 15| 17|  44| 142|
|          Jackets| 35| 38| 62| 118| 451|
|          Legwear| 10|  6| 16|  29| 104|
|             Swim| 10| 28| 42|  73| 197|
|          Blouses|119|229|394| 669|1686|
|           Skirts| 34| 51|108| 208| 544|
|        Intimates|  7|  8| 16|  27|  96|
|           Shorts|  9| 17| 41|  67| 183|
|          Dresses|227|461|837|1395|3397|
+-----------------+---+---+---+---

#####  Division Name, Department Name, Class Name, Review Text help classify ratings => dropna instead of remove column


In [15]:
def to_null(c):
    return when(~(col(c).isNull() | isnan(col(c)) | (trim(col(c)) == "")), col(c))
df = df.select([to_null(c).alias(c) for c in df.columns]).na.drop()
df.select([count(when(isnan(c), c)).alias(c) for c in df.columns]).toPandas().T

Unnamed: 0,0
Clothing ID,0
Age,0
Review Text,0
Rating,0
Recommended IND,0
Positive Feedback Count,0
Division Name,0
Department Name,0
Class Name,0


In [16]:
df.groupby('Rating').agg({'Clothing ID': 'mean', 'Age': 'mean', 'Recommended IND': 'mean', 'Positive Feedback Count': 'mean'}).show()

+------+--------------------+----------------------------+-----------------+------------------+
|Rating|avg(Recommended IND)|avg(Positive Feedback Count)| avg(Clothing ID)|          avg(Age)|
+------+--------------------+----------------------------+-----------------+------------------+
|     1|0.018292682926829267|           3.551219512195122|917.0341463414634| 43.72560975609756|
|     3|  0.4145995747696669|          3.1948972360028347|928.2505315379163| 42.15946137491141|
|     5|  0.9981638192559477|          2.4102666453776145|915.8082388631647|43.697189845122146|
|     4|  0.9665783574485429|           2.488485836560016|923.0034644385572|43.007540248624416|
|     2|0.060723514211886306|           3.356589147286822|926.3365633074935| 42.61434108527132|
+------+--------------------+----------------------------+-----------------+------------------+



##### keep 2 columns Recommended IND, Positive Feedback Count, drop Clothing ID and Age

In [17]:
df = df.drop('Age', 'Clothing ID')
df.show()

+--------------------+------+---------------+-----------------------+--------------+---------------+----------+
|         Review Text|Rating|Recommended IND|Positive Feedback Count| Division Name|Department Name|Class Name|
+--------------------+------+---------------+-----------------------+--------------+---------------+----------+
|Absolutely wonder...|     4|              1|                      0|     Initmates|       Intimate| Intimates|
|Love this dress! ...|     5|              1|                      4|       General|        Dresses|   Dresses|
|I had such high h...|     3|              0|                      0|       General|        Dresses|   Dresses|
|I love, love, lov...|     5|              1|                      0|General Petite|        Bottoms|     Pants|
|This shirt is ver...|     5|              1|                      6|       General|           Tops|   Blouses|
|I love tracy rees...|     2|              0|                      4|       General|        Dresses|   D

In [18]:
df = df.withColumn('length', length(df['Review Text']))
df.groupby('Rating').mean('length').show()

+------+------------------+
|Rating|       avg(length)|
+------+------------------+
|     1|304.01951219512193|
|     3|327.42700212615165|
|     5|298.29402842088456|
|     4| 322.3945384145099|
|     2| 318.6052971576227|
+------+------------------+



##### drop length

In [19]:
df = df.drop('length')

In [20]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from pyspark.ml.feature import CountVectorizer, IDF, StringIndexer

In [21]:
token = Tokenizer(inputCol = 'Review Text', outputCol = 'token_text')
swremove = StopWordsRemover(inputCol = 'token_text', outputCol = 'stop_tokens')
count_vec = CountVectorizer(inputCol = 'stop_tokens', outputCol = 'c_vec')
idf = IDF(inputCol = 'c_vec', outputCol = 'tf_idf')
idx1 = StringIndexer(inputCol = 'Rating', outputCol = 'label')
idx2 = StringIndexer(inputCol = 'Recommended IND', outputCol = 'rec_id')
idx3 = StringIndexer(inputCol = 'Division Name', outputCol = 'div_id')
idx4 = StringIndexer(inputCol = 'Department Name', outputCol = 'dep_id')
idx5 = StringIndexer(inputCol = 'Class Name', outputCol = 'class_id')

In [22]:
clean = VectorAssembler(inputCols = ['tf_idf', 'rec_id', 'Positive Feedback Count', 'div_id', 'dep_id', 'class_id'], outputCol = 'features')

In [23]:
pipe = Pipeline(stages = [idx1, idx2, idx3, idx4, idx5, token, swremove, count_vec, idf, clean])

In [24]:
cleaner = pipe.fit(df)

In [25]:
final = cleaner.transform(df)

In [26]:
final = final.select(['features', 'label'])

In [27]:
final.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(37546,[22,51,139...|  1.0|
|(37546,[0,1,11,12...|  0.0|
|(37546,[2,6,7,9,1...|  2.0|
|(37546,[0,5,8,42,...|  0.0|
|(37546,[0,5,23,36...|  0.0|
+--------------------+-----+
only showing top 5 rows



In [28]:
from pyspark.ml.classification import NaiveBayes

In [29]:
log = LogisticRegression(labelCol = 'label', featuresCol = 'features')
nb = NaiveBayes(labelCol = 'label', featuresCol = 'features')
dtc = DecisionTreeClassifier(labelCol = 'label', featuresCol = 'features')
rfc = RandomForestClassifier(labelCol = 'label', featuresCol = 'features')

In [30]:
train_data, test_data = final.randomSplit([0.7, 0.3])

In [31]:
logmodel = log.fit(train_data)
nbmodel = nb.fit(train_data)
dtcmodel = dtc.fit(train_data)
rfcmodel = rfc.fit(train_data)

In [32]:
logres = logmodel.transform(test_data)
nbres = nbmodel.transform(test_data)
dtcres = dtcmodel.transform(test_data)
rfcres = rfcmodel.transform(test_data)

In [33]:
acc = MulticlassClassificationEvaluator()

In [34]:
acc.evaluate(logres)

0.5484864423178412

In [35]:
acc.evaluate(nbres)

0.553694205257784

In [36]:
acc.evaluate(dtcres)

0.5403386495694973

In [37]:
acc.evaluate(rfcres)

0.3915109839439266

##### => choose Model NaiveBayes 

In [38]:
xls = pd.ExcelFile('Womens_Clothing_E_Commerce_Reviews.xlsx')
data = pd.read_excel(xls, 'new_reviews',  index_col=0)
data

Unnamed: 0,Clothing ID,Age,Title,Review Text,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,1077,53,Dress looks like it's made of cheap material,Dress runs small esp where the zipper area run...,0,14,General,Dresses,Dresses
1,862,66,Cute top,Nice top. armholes are a bit oversized but as ...,1,2,General,Tops,Knits
2,1080,31,Underwhelmed,Was really excited for this dress but should h...,0,1,General,Dresses,Dresses
3,936,35,Absolutely perfect,If you are going for a ridiculously high price...,0,9,General,Tops,Sweaters
4,872,35,Cute comfy casual,I saw this online and immediately purchased th...,1,0,General,Tops,Knits


In [39]:
mySchema = StructType([ StructField("Clothing ID", IntegerType(), True)\

                       ,StructField("Age", IntegerType(), True)\

                       ,StructField("Title", StringType(), True)\

                       ,StructField("Review Text", StringType(), True)\

                       ,StructField("Recommended IND", IntegerType(), True)\

                       ,StructField("Positive Feedback Count", IntegerType(), True)\
                       
                       ,StructField("Division Name", StringType(), True)\

                       ,StructField("Department Name", StringType(), True)\

                       ,StructField("Class Name", StringType(), True)
])
df_new = spark.createDataFrame(data,schema=mySchema)
df_new.show()

+-----------+---+--------------------+--------------------+---------------+-----------------------+-------------+---------------+----------+
|Clothing ID|Age|               Title|         Review Text|Recommended IND|Positive Feedback Count|Division Name|Department Name|Class Name|
+-----------+---+--------------------+--------------------+---------------+-----------------------+-------------+---------------+----------+
|       1077| 53|Dress looks like ...|Dress runs small ...|              0|                     14|      General|        Dresses|   Dresses|
|        862| 66|            Cute top|Nice top. armhole...|              1|                      2|      General|           Tops|     Knits|
|       1080| 31|        Underwhelmed|Was really excite...|              0|                      1|      General|        Dresses|   Dresses|
|        936| 35|  Absolutely perfect|If you are going ...|              0|                      9|      General|           Tops|  Sweaters|
|        872|

In [40]:
pipe = Pipeline(stages = [idx2, idx3, idx4, idx5, token, swremove, count_vec, idf, clean])
cleaner_new = pipe.fit(df_new)

In [41]:
final_new = cleaner_new.transform(df_new)

In [42]:
final_new = final_new.select(['features'])
final_new.show()

+--------------------+
|            features|
+--------------------+
|(152,[0,1,2,3,4,5...|
|(152,[7,10,12,13,...|
|(152,[0,2,3,5,8,9...|
|(152,[1,4,8,11,12...|
|(152,[6,9,20,24,2...|
+--------------------+



In [43]:
results = nbmodel.transform(final_new)