In [37]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, TruncatedSVD

# Data Description
The dataset is obtained from Kaggle (https://www.kaggle.com/nicapotato/womens-ecommerce-clothing-reviews)

### The following has been done in this notebook
1. Loading of the dataset
2. Some exploratory data analysis
3. Cleaning dataset eg.: removing NaN values

# Loading the dataset

In [38]:
df = pd.read_csv(r'C:\Users\veda.nair\Downloads\archive\Womens Clothing E-Commerce Reviews.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


# Exploratoty data analysis

In [39]:
# list of column names.
column_list = df.columns
column_list

Index(['Unnamed: 0', 'Clothing ID', 'Age', 'Title', 'Review Text', 'Rating',
       'Recommended IND', 'Positive Feedback Count', 'Division Name',
       'Department Name', 'Class Name'],
      dtype='object')

In [40]:
# contains 23486 rows and 11 columns.
df.shape

(23486, 11)

In [41]:
# taking out the 'Unnamed: 0' and 'Clothing ID' column as they are not useful for any analysis
df = df.drop(['Unnamed: 0', 'Clothing ID'], axis=1)

# clean the white space from the column names.
df = df.rename(columns=lambda x: x.replace(' ', ''))

In [43]:
#counting number of Na values for each column
df.isnull().sum(axis = 0)

Age                         0
Title                    3810
ReviewText                845
Rating                      0
RecommendedIND              0
PositiveFeedbackCount       0
DivisionName               14
DepartmentName             14
ClassName                  14
dtype: int64

### How many NAs we have?
- Those NAs in **DivisionName**, **DepartmentName** and **ClassName** are the same, since there are very few, will drop them.
- For **ReviewText**, since we are preforming our NLP on that column, we can't preform NLP if we don't have any text to analyze, so I will drop them.
- For **Title**, since it is text and I am doing NLP, eventually I probabily will need to prefrom NLP on that column and combine with the ReviewText column to see if there is any meaningful unsupervised learning results.My solution is to create a new column called CombineText, which comebine the Title and ReviewText column together and makes the Title like the first sentense of the review.


### Cleaning the NAs

In [44]:
# dorpping NAs for 4 columns.
subset = ['ReviewText', 'DivisionName', 'DepartmentName', 'ClassName']
df = df.dropna(subset=subset)

print('Now length of df is: ', len(df))

Now length of df is:  22628


In [45]:
# first fill NAs in the Title column with space, so I can concatenate the Title and ReviewText column together.
df.Title.fillna('', inplace=True)

# create a new column named CombinedText with Title and ReviewText.
df['CombinedText'] = df.Title + ' ' + df.ReviewText

# drop the Title column.
df.drop('Title', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.Title.fillna('', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['CombinedText'] = df.Title + ' ' + df.ReviewText
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop('Title', axis=1, inplace=True)


In [46]:
# general stats for the 4 numeric columns.
df.describe()

Unnamed: 0,Age,Rating,RecommendedIND,PositiveFeedbackCount
count,22628.0,22628.0,22628.0,22628.0
mean,43.28288,4.183092,0.818764,2.631784
std,12.328176,1.115911,0.385222,5.78752
min,18.0,1.0,0.0,0.0
25%,34.0,4.0,1.0,0.0
50%,41.0,5.0,1.0,1.0
75%,52.0,5.0,1.0,3.0
max,99.0,5.0,1.0,122.0


In [47]:
# there is no more NAs
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22628 entries, 0 to 23485
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Age                    22628 non-null  int64 
 1   ReviewText             22628 non-null  object
 2   Rating                 22628 non-null  int64 
 3   RecommendedIND         22628 non-null  int64 
 4   PositiveFeedbackCount  22628 non-null  int64 
 5   DivisionName           22628 non-null  object
 6   DepartmentName         22628 non-null  object
 7   ClassName              22628 non-null  object
 8   CombinedText           22628 non-null  object
dtypes: int64(4), object(5)
memory usage: 1.7+ MB


### Export as Pickle

In [48]:
df.to_pickle('cleaned_df.pkl')