In [1]:
import numpy as np
import pandas as pd
import spacy

In [2]:
# Load the spaCy English pipeline once; it is reused below to parse each review
# and read the part-of-speech tag and lemma of every token.
nlp = spacy.load('en_core_web_sm')

In [3]:
########################### Data processing ###############################

In [4]:
# Read the raw review export; the path is relative to the notebook's working directory.
raw_csv_path = 'Data/Womens Clothing E-Commerce Reviews.csv'
raw_df = pd.read_csv(raw_csv_path)

In [5]:
# Preview the first 10 rows of the raw data to see which columns are available.
raw_df.head(10)

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses
5,5,1080,49,Not for the very petite,"I love tracy reese dresses, but this one is no...",2,0,4,General,Dresses,Dresses
6,6,858,39,Cagrcoal shimmer fun,I aded this in my basket at hte last mintue to...,5,1,1,General Petite,Tops,Knits
7,7,858,39,"Shimmer, surprisingly goes with lots","I ordered this in carbon for store pick up, an...",4,1,4,General Petite,Tops,Knits
8,8,1077,24,Flattering,I love this dress. i usually get an xs but it ...,5,1,0,General,Dresses,Dresses
9,9,1077,34,Such a fun dress!,"I'm 5""5' and 125 lbs. i ordered the s petite t...",5,1,0,General,Dresses,Dresses


In [6]:
# Keep only the product id, the review text and the rating; the other columns are not used.
needed_columns = ['Clothing ID', 'Review Text', 'Rating']
reviews_df = raw_df[needed_columns]

In [7]:
# Inspect the dtype of each kept column to confirm that the product id and rating
# are numeric and that the review text is stored as generic objects (strings).

reviews_df.dtypes

Clothing ID     int64
Review Text    object
Rating          int64
dtype: object

In [8]:
# Use underscore-separated column names so they are easier to reference later.
renamed_columns = {'Clothing ID': 'Clothing_ID', 'Review Text': 'Review_Text'}
reviews_df = reviews_df.rename(columns=renamed_columns)

In [9]:
# Discard every row that has a missing value in any of the kept columns
# (these are the dropna defaults, spelled out for the reader).
reviews_df = reviews_df.dropna(axis='index', how='any')

In [10]:
# Positional indices of the first 5 rows (0..4) and the last 5 rows (-5..-1);
# used with .iloc on a sorted Series to pick its top 5 and bottom 5 entries.
first_five = np.arange(5)
last_five = np.arange(-5, 0)
indices = np.concatenate([first_five, last_five])

indices

array([ 0,  1,  2,  3,  4, -5, -4, -3, -2, -1])

In [11]:
# determining the top 5 and bottom 5 observations, based on the number of reviews left, in addition to the average product rating. 
# The analysis will focus only on those, but can naturally be extended to all the other observations

In [12]:
# Number of reviews per product (clothing ID), ranked from most to least reviewed.
reviews_per_product = reviews_df.groupby('Clothing_ID').size()
ranked_review_counts = reviews_per_product.sort_values(ascending=False)

# Keep the 5 most and 5 least reviewed products and turn the Series back into a DataFrame.
count_obs_clothing_ID_df = (
    ranked_review_counts
    .iloc[indices]
    .reset_index(name='count_obs')
)

count_obs_clothing_ID_df

Unnamed: 0,Clothing_ID,count_obs
0,1078,987
1,862,778
2,1094,735
3,1081,561
4,872,519
5,483,1
6,484,1
7,485,1
8,487,1
9,610,1


In [13]:
# Check whether every product in the top-5/bottom-5 selection has exactly one review.
# The result is False: the most reviewed products have hundreds of reviews while the
# least reviewed ones have a single review, so the review count does vary by product.
(count_obs_clothing_ID_df['count_obs']==1).all()

False

In [14]:
# Average rating per product (clothing ID), ranked from best to worst.
avg_rating_per_product = reviews_df.groupby('Clothing_ID')['Rating'].mean()
ranked_avg_ratings = avg_rating_per_product.sort_values(ascending=False)

# Keep the 5 best- and 5 worst-rated products and turn the Series back into a DataFrame.
# NOTE(review): the selected products all sit at exactly 5.0 or 1.0, so which five are
# kept at each end presumably depends on how ties are ordered by the sort - confirm
# that this is acceptable for the analysis.
ratings_df = (
    ranked_avg_ratings
    .iloc[indices]
    .reset_index(name='avg_rating')
)

ratings_df

Unnamed: 0,Clothing_ID,avg_rating
0,1205,5.0
1,449,5.0
2,474,5.0
3,472,5.0
4,471,5.0
5,348,1.0
6,285,1.0
7,421,1.0
8,1176,1.0
9,669,1.0


In [15]:
#################### NLP - Analysis ###################################

In [16]:
# Drop the rating column: the per-product averages are already stored in ratings_df.
# errors='ignore' keeps the cell re-runnable - without it, executing the cell a second
# time raises a KeyError because 'Rating' has already been removed from reviews_df.

reviews_df = reviews_df.drop(columns='Rating', errors='ignore')

In [17]:
# Keep only the reviews of the products selected in ratings_df
# (the 5 best- and 5 worst-rated clothing IDs).
selected_clothing_ids = ratings_df['Clothing_ID'].to_numpy()
is_selected_product = reviews_df['Clothing_ID'].isin(selected_clothing_ids)
reviews_df = reviews_df.loc[is_selected_product]

In [18]:
# Run every review string through the spaCy pipeline; each cell of the new
# 'processed_review' column holds the Doc object created from that review.
parsed_reviews = reviews_df['Review_Text'].apply(nlp)
reviews_df = reviews_df.assign(processed_review=parsed_reviews)

reviews_df

Unnamed: 0,Clothing_ID,Review_Text,processed_review
3108,285,It seemed to stay in place and did not require...,"(It, seemed, to, stay, in, place, and, did, no..."
8122,472,These are such beautiful cheerful shorts. the ...,"(These, are, such, beautiful, cheerful, shorts..."
8125,472,I typically wear boring shorts. you know - kha...,"(I, typically, wear, boring, shorts, ., you, k..."
8133,472,Love shorts that i can dress down w/ a chambra...,"(Love, shorts, that, i, can, dress, down, w/, ..."
8150,472,"First off, they are much prettier irl than in ...","(First, off, ,, they, are, much, prettier, irl..."
8151,472,These shorts are extremely cute! i usually hav...,"(These, shorts, are, extremely, cute, !, i, us..."
8782,348,Top is completely see through. i cant imagine ...,"(Top, is, completely, see, through, ., i, ca, ..."
9274,1176,I wore this suit for the first time yesterday ...,"(I, wore, this, suit, for, the, first, time, y..."
12315,471,Love the mara hoffman basketweave bikini botto...,"(Love, the, mara, hoffman, basketweave, bikini..."
13678,669,These are light and airy and pretty. they also...,"(These, are, light, and, airy, and, pretty, .,..."


In [19]:
def get_unique_adjectives(input_nlp):
    '''Collect the distinct adjective lemmas of one parsed customer review.

    input_nlp: iterable of tokens exposing `pos_` (part-of-speech tag) and
        `lemma_` (base form), e.g. the Doc object built from a review.

    Returns a set with the lemma of every token tagged as an adjective.
    '''
    adjectives = set()

    for tok in input_nlp:
        # skip everything whose part-of-speech (POS) tag is not an adjective
        if tok.pos_ != 'ADJ':
            continue
        adjectives.add(tok.lemma_)

    return adjectives
      

In [20]:
# Replace each parsed review with the set of unique adjectives it contains.
adjectives_per_review = reviews_df['processed_review'].apply(get_unique_adjectives)
reviews_df = reviews_df.assign(processed_review=adjectives_per_review)

reviews_df

Unnamed: 0,Clothing_ID,Review_Text,processed_review
3108,285,It seemed to stay in place and did not require...,"{tight, comfortable, constant, small}"
8122,472,These are such beautiful cheerful shorts. the ...,"{such, beautiful, floral, dressy, cheerful, pe..."
8125,472,I typically wear boring shorts. you know - kha...,"{boring, true, short, perfect, tight, slim}"
8133,472,Love shorts that i can dress down w/ a chambra...,"{perfect, big, small, thin, chambray}"
8150,472,"First off, they are much prettier irl than in ...","{pretty, large, wrong}"
8151,472,These shorts are extremely cute! i usually hav...,"{good, floral, usual, loose, tight, cute, nice..."
8782,348,Top is completely see through. i cant imagine ...,"{practical, wet, cute, top}"
9274,1176,I wore this suit for the first time yesterday ...,"{first, black, white, inner, lovely}"
12315,471,Love the mara hoffman basketweave bikini botto...,"{good, red, classic, skimpy, full, 5'2, upper,..."
13678,669,These are light and airy and pretty. they also...,"{other, serious, light, thin, third, little}"


In [21]:
# Collapse the data to one row per product: the union of the adjective sets
# of all reviews written for that clothing ID.
adjective_sets_by_product = reviews_df.groupby('Clothing_ID')['processed_review']

reviews_df = (
    adjective_sets_by_product
    # merge all the per-review sets linked to a clothing ID into a single set
    .apply(lambda review_sets: set().union(*review_sets))
    .reset_index()
)

In [22]:
# Attach the average rating to each product's adjective set, so the rows can be
# ordered with the top 5 group first and the bottom 5 group after it.
reviews_ratings_df = reviews_df.merge(ratings_df, how='inner', on='Clothing_ID')

reviews_ratings_df

Unnamed: 0,Clothing_ID,processed_review,avg_rating
0,285,"{tight, comfortable, constant, small}",1.0
1,348,"{top, wet, practical, cute}",1.0
2,421,"{wide, rough}",1.0
3,449,"{flattering, comfortable, cute}",5.0
4,471,"{good, red, classic, skimpy, full, 5'2, upper,...",5.0
5,472,"{true, short, wrong, big, loose, cute, chambra...",5.0
6,474,"{roomy, perfect, other, happy, large, substant...",5.0
7,669,"{other, serious, light, thin, third, little}",1.0
8,1176,"{first, black, white, inner, lovely}",1.0
9,1205,"{normal, regular, perfect, 12th, beige, great,...",5.0


In [23]:
# Put the highest-rated products first and renumber the rows from 0.
reviews_ratings_df = (
    reviews_ratings_df
    .sort_values(by='avg_rating', ascending=False)
    .reset_index(drop=True)
)

In [24]:
# The average rating was only needed to order the rows, so it is dropped here.
# errors='ignore' keeps the cell re-runnable - without it, executing the cell a second
# time raises a KeyError because 'avg_rating' has already been removed.

reviews_ratings_df = reviews_ratings_df.drop(columns='avg_rating', errors='ignore')

In [25]:
# Show the full content of each cell: None removes the column-width limit,
# so the adjective sets are no longer truncated with "..."
pd.options.display.max_colwidth = None

reviews_ratings_df

Unnamed: 0,Clothing_ID,processed_review
0,449,"{flattering, comfortable, cute}"
1,471,"{good, red, classic, skimpy, full, 5'2, upper, cheeky}"
2,472,"{true, short, wrong, big, loose, cute, chambray, similar, brown, usual, such, boring, floral, dressy, cheerful, small, good, perfect, pretty, thin, beautiful, large, tight, last, slim, nice, hard, friendly}"
3,474,"{roomy, perfect, other, happy, large, substantial, little, lovely}"
4,1205,"{normal, regular, perfect, 12th, beige, great, fine, second}"
5,285,"{tight, comfortable, constant, small}"
6,348,"{top, wet, practical, cute}"
7,421,"{wide, rough}"
8,669,"{other, serious, light, thin, third, little}"
9,1176,"{first, black, white, inner, lovely}"


In [26]:
# Look up the raw rows of the two products used as examples in the discussion below.
example_clothing_ids = [474, 669]
raw_df.loc[raw_df['Clothing ID'].isin(example_clothing_ids)]

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
13678,13678,669,21,Pretty but cheap,"These are light and airy and pretty. they also tore the third time i wore them and not on a seam - a little above the knee while i was lying in my bed.\r\n\r\nthey also do not slim the figure and the fabric is slightly see-through as it is so thin, therefore if you are looking for serious lounge pants - great - i would not leave the house in them... or turn over in bed in them.\r\n\r\nwhen i sit, the slits fall down around my thighs. my son complimented my dress the other day because the slits give way",1,0,6,General Petite,Intimate,Lounge
17921,17921,474,62,,"Just received these this morning and so happy. cartonnier always makes a substantial quality product. might run a little large,depends how roomy you want them, lovely colors and perfect length. wish they came in other colors.",5,1,0,General,Bottoms,Shorts


- For the top 5 group:

- Overall, clear attributes that make each product successful can be identified (e.g. for product 474, which is a pair of shorts, the comfort aspect seems to take precedence).

    - Accordingly a line of products targeted toward similar customers may be formed (focusing on the comfort/feel good in one's own body), if such segment is large enough to be profitable
    - Other features may be leveraged e.g. new colors, to account for further needs (i.e. microsegmentation)
    
<br>
- For highly rated products with almost exclusively positive reviews, we may keep them as they are and plan only minor updates/variations in line with the seasons/trends (while monitoring the sales volume and customer feedback over time).

-----------------------------

- For the bottom 5 group:

- We note again relatively clear reasons for why a product is poorly considered (e.g. product 669 is deemed too thin)

- Given the sales volume of such products and the need or not to keep such products strategically (e.g. barrage against competitors to protect from an attack which may expand to more successful products), we may drop those, as the cost of better serving customers (correcting problems) might not be offset by more sales/price (to be assessed using a scenario analysis for instance accounting for the demand for such changes).

- The reviews help to improve future products, to avoid repeating errors (i.e. customer orientation)

- Products which have many negative reviews should possibly be dropped, as they may lead to further dissatisfied customers in the future

In [27]:
##########################################################

In [28]:
# fit an LDA (Latent Dirichlet Allocation) model to extract topics - topic modeling
# and plot a word cloud per topic, allowing us to understand what each topic means

# see https://www.analyticsvidhya.com/blog/2018/10/mining-online-reviews-topic-modeling-lda/

In [29]:
def process_text(input_nlp):
    '''Returns the unique adjectives of a parsed review as a single string.

    input_nlp: iterable of tokens exposing `pos_` (part-of-speech tag) and
        `lemma_` (base form), e.g. the Doc object built from a review.

    Returns the lemmas of the tokens tagged as adjectives, each kept once,
    joined by single spaces in order of first appearance; an empty string
    when the review contains no adjective. No stop-word or alphabetic
    filtering is applied.
    '''
    # Part-of-speech (POS) tag should correspond to an adjective
    adjective_lemmas = (
        token.lemma_ for token in input_nlp if token.pos_ == 'ADJ'
    )

    # dict.fromkeys keeps each lemma once, in first-occurrence order. Joining a
    # set instead would give a word order that changes from one kernel run to
    # the next (string hashing is randomised), making the output irreproducible.
    unique_adjectives = dict.fromkeys(adjective_lemmas)

    return ' '.join(unique_adjectives)
        

In [30]:
# if LDA ends up giving inconsistent results, we
# simply use the approach below


# using POS tagging get all the adjectives in a list


#then we concatenate those lists per clothing ID

#and for the top 5 we interpret
