<a href="https://colab.research.google.com/github/yiyukk/DeloitteProject/blob/main/customer_economics_cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
from google.colab import drive


## Data Cleaning - Customer_economics

#### Reading Data

In [None]:
# Mount Google Drive so the shared project folder is reachable from Colab
drive.mount('/content/drive', force_remount=True)

# Folder layout on the shared drive
base_path = '/content/drive/Shared drives/Deloitte'
data_path = f'{base_path}/Original Data'

# Read the raw customer economics workbook into a DataFrame
customer_economics = pd.read_excel(f'{data_path}/customer_economics.xlsx')

# Preview the first few rows
customer_economics.head()

Mounted at /content/drive


Unnamed: 0,Rating,Number.of.Reviews,ID,Sales.Amount,Number.of.Employees
0,3.101,58,1.0,29759.508604,33.0
1,4.14,13,2.0,38363.039529,29.0
2,3.916,22,3.0,26352.77712,36.0
3,3.803,33,4.0,27910.264616,26.0
4,3.958,47,5.0,33478.441029,31.0


#### Data Exploration

In [None]:
# Check the shape of the dataset as (rows, columns)

customer_economics.shape

(9662, 5)

In [None]:
# Count fully duplicated rows (rows identical in every column)
customer_economics.duplicated().sum()

0

In [None]:
# List the column names
customer_economics.columns

Index(['Rating', 'Number.of.Reviews', 'ID', 'Sales.Amount',
       'Number.of.Employees'],
      dtype='object')

In [None]:
# Normalise column names to snake_case: trim whitespace, lower-case, and
# replace spaces and dots with underscores.
# regex=False is passed explicitly so '.' is always a literal dot. Interpreted
# as a regular expression, '.' matches any character, and the default value of
# `regex` differs between pandas versions.
customer_economics.columns = (
    customer_economics.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('.', '_', regex=False)
)
print(customer_economics.columns)

Index(['rating', 'number_of_reviews', 'id', 'sales_amount',
       'number_of_employees'],
      dtype='object')


In [None]:
# Check the data type of each column
customer_economics.dtypes

rating                  object
number_of_reviews        int64
id                     float64
sales_amount           float64
number_of_employees    float64
dtype: object

In [None]:
# Count the null values in each column
customer_economics.isna().sum()

rating                 2
number_of_reviews      0
id                     3
sales_amount           1
number_of_employees    2
dtype: int64

Next, we go through each variable and make the changes needed for our analysis.

#### ID

In [None]:
# Some ID numbers occur more than once, which should not happen. When two rows
# share an ID but disagree, we cannot tell which is correct, so they are deleted.
id_is_repeated = customer_economics.duplicated(subset='id', keep=False)
duplicated_id = customer_economics.loc[id_is_repeated]

# Several pairs are identical except that one row has a missing value; for
# those pairs we delete only the row containing the NA.
duplicated_id.sort_values('id', ascending=False)


Unnamed: 0,rating,number_of_reviews,id,sales_amount,number_of_employees
9439,3.298,24,9769.0,26472.409942,37.0
9645,3.298,24,9769.0,,37.0
9402,3.958,15,9733.0,38993.982464,
9417,3.958,15,9733.0,38993.982464,33.0
745,4.701,19,763.0,31629.197586,29.0
9661,4.092,24,763.0,27336.162153,31.0
4171,5.0,79,,31361.859212,34.0
4172,4.62,58,,43521.109613,32.0
4173,4.06,28,,32249.591869,31.0


In [None]:
# Rows without an ID cannot be identified, so remove them
customer_economics = customer_economics.dropna(subset=['id'])

In [None]:
# Drop every row that is missing sales_amount or number_of_employees; this
# removes the NaN halves of the duplicated ID pairs seen above.
customer_economics = customer_economics.dropna(
    subset=['sales_amount', 'number_of_employees']
)

In [None]:
# Re-check which IDs still appear more than once
duplicated = customer_economics[customer_economics.duplicated('id', keep=False)]
duplicated.sort_values(by='id', ascending=False)

Unnamed: 0,rating,number_of_reviews,id,sales_amount,number_of_employees
745,4.701,19,763.0,31629.197586,29.0
9661,4.092,24,763.0,27336.162153,31.0


In [None]:
# The two rows for ID 763 disagree and we cannot assume either one is correct,
# so remove every row carrying that ID.
is_id_763 = customer_economics['id'] == 763
customer_economics = customer_economics[~is_id_763]

In [None]:
# For consistency we change the ID -1 to 0.
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on the column selection: that form is a chained
# in-place operation, which raises a FutureWarning in recent pandas and does
# not update the DataFrame when Copy-on-Write is enabled.
customer_economics['id'] = customer_economics['id'].replace(-1, 0)

In [None]:
# List the rows whose ID is 0 or -1
customer_economics[customer_economics['id'].isin([0, -1])]

Unnamed: 0,rating,number_of_reviews,id,sales_amount,number_of_employees
6382,4.584,32,0.0,28831.069355,29.0


#### Rating - average rating of the customer (assuming 0 - 5)

In [None]:
# 'rating' was read as an object column. Cast it to text, swap any decimal
# comma for a decimal point, then parse the result as float.
rating_as_text = customer_economics['rating'].astype(str)
customer_economics['rating'] = rating_as_text.str.replace(',', '.').astype(float)

In [None]:
customer_economics.sort_values(by='rating', ascending=False).head()
# Store IDs 1273, 2610, 5593 and 2565 have ratings above 5. Ratings are assumed to lie between 0 and 5, so we treat these as data errors.
# The average rating is an important factor in identifying our best customers, so we delete these rows to keep the errors from affecting the results.

Unnamed: 0,rating,number_of_reviews,id,sales_amount,number_of_employees
1238,4146.0,112,1273.0,32146.581204,34.0
2536,9.005,28,2610.0,32279.149485,33.0
5437,7.342,27,5593.0,28147.333268,38.0
2492,5.143,27,2565.0,29256.396491,29.0
4830,5.0,48,4975.0,30123.464905,29.0


In [None]:
# Keep only ratings of at most 5; larger values are data-entry errors.
# Rows whose rating is NaN also fail this comparison and are dropped.
rating_within_scale = customer_economics['rating'].le(5)
customer_economics = customer_economics.loc[rating_within_scale]

In [None]:
# Inspect the lowest ratings
customer_economics.sort_values(by='rating', ascending=True).head()


Unnamed: 0,rating,number_of_reviews,id,sales_amount,number_of_employees
7124,-1.0,37,7358.0,32577.119283,31.0
870,-1.0,27,892.0,26863.302503,29.0
8286,0.0,18,8568.0,30278.13924,25.0
7803,2.023,7,8066.0,21769.174458,27.0
255,2.028,22,264.0,25414.658841,32.0


In [None]:
# Keep only strictly positive ratings: -1 is a data-entry error, and a rating
# of 0 is assumed to be impossible when the customer has reviews.
rating_is_positive = customer_economics['rating'].gt(0)
customer_economics = customer_economics.loc[rating_is_positive]

#### Number of reviews

In [None]:
customer_economics.sort_values(by='number_of_reviews', ascending=True).head(10)
# Ratings of 0 or -1 were treated as incorrect: a rating must be > 0, because leaving a review requires giving a rating.
# We assume a customer can have a rating without any reviews, so 0 reviews is a valid value.

Unnamed: 0,rating,number_of_reviews,id,sales_amount,number_of_employees
6814,3.85,-1,7036.0,30578.188339,31.0
1398,4.45,0,1441.0,29410.050233,31.0
4152,2.824,0,4280.0,31194.111539,31.0
5873,3.202,4,6046.0,20751.32691,26.0
1232,2.25,4,1267.0,20612.25203,32.0
8068,2.791,4,8340.0,33867.215294,24.0
5597,3.482,5,5757.0,25851.284549,34.0
2177,3.062,5,2239.0,34397.097104,37.0
6702,2.499,5,6920.0,24637.272708,29.0
8367,3.327,5,8652.0,22394.92775,32.0


In [None]:
# A negative number of reviews is impossible, so map -1 to 0
customer_economics['number_of_reviews'] = customer_economics['number_of_reviews'].replace({-1: 0})

In [None]:
# Check for rows where number_of_reviews is missing
nan_rows = customer_economics[customer_economics['number_of_reviews'].isna()]
nan_rows

Unnamed: 0,rating,number_of_reviews,id,sales_amount,number_of_employees


In [None]:
customer_economics.sort_values(by='number_of_reviews', ascending=False).head(10)
# IDs 8576 and 3283 are outliers. Because we consider number_of_reviews less important, we do not want to delete these rows.

Unnamed: 0,rating,number_of_reviews,id,sales_amount,number_of_employees
8294,5.0,12000,8576.0,33515.515029,29.0
3191,4.003,9000,3283.0,24142.94584,34.0
8447,5.0,313,8733.0,40339.252722,32.0
5260,4.308,295,5413.0,36215.816228,27.0
995,4.33,290,1022.0,35300.383066,33.0
4928,5.0,243,5074.0,35715.543211,29.0
6907,5.0,236,7133.0,47882.538726,29.0
2486,4.869,236,2559.0,35059.314316,33.0
7445,5.0,231,7690.0,38740.340185,29.0
3774,4.724,229,3887.0,34353.287265,33.0


In [None]:
# Histogram of review counts, used to spot outliers visually
fig = px.histogram(
    customer_economics,
    x='number_of_reviews',
    nbins=50,
    title='Distribution of Number of Reviews',
)

# Centre the title and label both axes
fig.update_layout(
    title_x=0.5,
    xaxis_title='Number of Reviews',
    yaxis_title='Frequency',
)

fig.show()

In [None]:
# Review counts above this threshold are treated as data-entry errors. The
# sorted output above shows 12000 and 9000, with the next highest value at 313.
REVIEW_OUTLIER_THRESHOLD = 400

# Step 1: Replace values above the threshold with NaN.
# Series.mask returns a new float series, which avoids writing NaN into an
# integer column in place.
reviews = customer_economics['number_of_reviews']
reviews = reviews.mask(reviews > REVIEW_OUTLIER_THRESHOLD)

# Step 2: Calculate the average number of reviews, excluding the NaN values
average_reviews = reviews.mean()

# Step 3: Impute the NaN values with that average and assign the column back.
# Assigning back is used instead of fillna(..., inplace=True) on a column
# selection, which does not update the DataFrame under Copy-on-Write.
# NOTE(review): the mean is generally not a whole number, so imputed counts
# are fractional; confirm that is acceptable for the downstream analysis.
customer_economics['number_of_reviews'] = reviews.fillna(average_reviews)


# Re-plot the distribution, with a marginal box plot, after imputation
fig_hist = px.histogram(customer_economics, x='number_of_reviews', nbins=50, title='Distribution of Number of Reviews', marginal="box", opacity=0.75)
fig_hist.update_layout(
    xaxis_title='Number of Reviews',
    yaxis_title='Frequency',
    title={'x': 0.5}
)
fig_hist.show()


#### Number of Employees

In [None]:
customer_economics.sort_values(by='number_of_employees', ascending=True)
# Two stores have number_of_employees equal to zero.
# One store has a single employee; we consider this an outlier.
# Assuming these values are wrong, we will replace them with the average number_of_employees for stores of the same size.
# This is done on the merged dataset, because that is where the store size is available.


Unnamed: 0,rating,number_of_reviews,id,sales_amount,number_of_employees
1955,4.307,35.0,2011.0,28356.028756,0.0
192,3.273,24.0,200.0,31652.224802,0.0
9656,4.966,60.0,9994.0,30884.234842,1.0
9411,4.168,82.0,9742.0,30993.009554,20.0
9405,3.686,16.0,9736.0,25324.728371,20.0
...,...,...,...,...,...
7848,3.263,12.0,8112.0,24488.127354,40.0
1143,4.061,37.0,1175.0,41657.558930,41.0
9310,2.771,10.0,9636.0,36815.066467,41.0
1320,4.200,18.0,1358.0,27159.683176,42.0


In [None]:
# Round the number of employees to the nearest whole number.
# NOTE(review): Series.round() rounds to the nearest integer, it does not round down;
# if truncation was intended, np.floor would be needed instead. Confirm the intent.
customer_economics['number_of_employees'] = customer_economics['number_of_employees'].round()


#### Sales Amount

In [None]:
# Inspect sales_amount sorted from smallest to largest
customer_economics.sort_values(by='sales_amount', ascending=True)


Unnamed: 0,rating,number_of_reviews,id,sales_amount,number_of_employees
3252,3.354,11.0,3350.0,-2.892586e+04,30.0
8841,4.024,28.0,9147.0,-1.000000e+00,30.0
1232,2.250,4.0,1267.0,2.061225e+04,32.0
5873,3.202,4.0,6046.0,2.075133e+04,26.0
4068,2.785,5.0,4194.0,2.173442e+04,33.0
...,...,...,...,...,...
316,4.249,222.0,325.0,4.803903e+04,29.0
2164,5.000,170.0,2226.0,4.805666e+04,27.0
2613,5.000,210.0,2688.0,4.881321e+04,28.0
5613,3.482,56.0,5773.0,3.014756e+08,34.0


In [None]:
# Two rows have a negative 'sales_amount'; remove them
has_negative_sales = customer_economics['sales_amount'] < 0
customer_economics = customer_economics[~has_negative_sales]

In [None]:
customer_economics.sort_values(by='sales_amount', ascending=False)
# Looking at further details for ID = 3659 and ID = 5773 (date_opening, store_size and number_of_employees),
# sales amounts this large look impossible, so we consider them extreme outliers and delete them.

Unnamed: 0,rating,number_of_reviews,id,sales_amount,number_of_employees
3554,3.170,18.0,3659.0,3.009795e+13,27.0
5613,3.482,56.0,5773.0,3.014756e+08,34.0
2613,5.000,210.0,2688.0,4.881321e+04,28.0
2164,5.000,170.0,2226.0,4.805666e+04,27.0
316,4.249,222.0,325.0,4.803903e+04,29.0
...,...,...,...,...,...
5346,2.139,8.0,5500.0,2.183212e+04,32.0
7803,2.023,7.0,8066.0,2.176917e+04,27.0
4068,2.785,5.0,4194.0,2.173442e+04,33.0
5873,3.202,4.0,6046.0,2.075133e+04,26.0


In [None]:
# Remove the two stores with implausibly large sales_amount (IDs 3659 and 5773).
# The filter must be on the 'id' column: comparing 'sales_amount' against the
# ID numbers matches no rows and would leave both outliers in the data.
outlier_ids = [3659, 5773]
customer_economics = customer_economics[~customer_economics['id'].isin(outlier_ids)]

In [None]:
# Check the column data types after cleaning
customer_economics.dtypes

rating                 float64
number_of_reviews      float64
id                     float64
sales_amount           float64
number_of_employees    float64
dtype: object

In [None]:
# Store IDs as text without a trailing ".0": casting a float column straight
# to str yields '1.0', '2.0', and so on. Going through the nullable Int64 dtype
# first gives '1', '2'; that cast raises if any ID is not a whole number, so
# nothing is silently truncated.
customer_economics['id'] = customer_economics['id'].astype('Int64').astype(str)

In [None]:
# Confirm that 'id' is now stored as object (string)
customer_economics.dtypes

rating                 float64
number_of_reviews      float64
id                      object
sales_amount           float64
number_of_employees    float64
dtype: object

### Save the dataset to a new version containing the cleaned data set with all the columns

In [None]:
# Write the cleaned frame to a CSV in the current working directory, without the index column
customer_economics.to_csv("customer_economics_V2.csv", index=False)

#### Dropping all the columns that are not relevant

In [None]:
#no columns to be dropped

In [None]:
# Reload the saved CSV. 'id' is read as text because read_csv would otherwise
# re-infer it as a number and undo the string conversion done before saving.
customer_economics_v2 = pd.read_csv('customer_economics_V2.csv', dtype={'id': str})

customer_economics_v2.head()

Unnamed: 0,rating,number_of_reviews,id,sales_amount,number_of_employees
0,3.101,58.0,1.0,29759.508604,33.0
1,4.14,13.0,2.0,38363.039529,29.0
2,3.916,22.0,3.0,26352.77712,36.0
3,3.803,33.0,4.0,27910.264616,26.0
4,3.958,47.0,5.0,33478.441029,31.0


## Save as a clean data set

In [None]:
# Write the cleaned data set to the shared "Cleaned Data" folder on Drive
output_dir = '/content/drive/Shared drives/Deloitte/Cleaned Data'
cleaned_csv_path = f"{output_dir}/customer_economics_cleaned.csv"

customer_economics_v2.to_csv(cleaned_csv_path, index=False)
