# 00. Table of contents
 - Importing libraries
 - Importing Dataset
 - Data Wrangling for new df: customers
 - Data consistency checks for new df: customers
 - Exporting dfs as a preparation for merge in a separate notebook

# 01. Importing libraries

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

# 02. Importing dataset

In [2]:
path = r'C:\Users\viki\Documents\Data Analytics\Immersion\Achievement 4\Instacart Basket Analysis'

In [3]:
df_customers= pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'customers.csv'))

In [4]:
ords_prods_merge= pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_merged_4_9.pkl'))

# 03. Data Wrangling (Step 4)

### Exploring the new df

In [5]:
df_customers.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [6]:
df_customers.describe()

Unnamed: 0,user_id,Age,n_dependants,income
count,206209.0,206209.0,206209.0,206209.0
mean,103105.0,49.501646,1.499823,94632.852548
std,59527.555167,18.480962,1.118433,42473.786988
min,1.0,18.0,0.0,25903.0
25%,51553.0,33.0,0.0,59874.0
50%,103105.0,49.0,1.0,93547.0
75%,154657.0,66.0,3.0,124244.0
max,206209.0,81.0,3.0,593901.0


In [7]:
df_customers.shape

(206209, 10)

In [8]:
df_customers.dtypes

user_id          int64
First Name      object
Surnam          object
Gender          object
STATE           object
Age              int64
date_joined     object
n_dependants     int64
fam_status      object
income           int64
dtype: object

### Checking frequencies

In [9]:
df_customers['Gender'].value_counts()

Male      104067
Female    102142
Name: Gender, dtype: int64

In [10]:
df_customers['STATE'].value_counts().sort_index()

Alabama                 4044
Alaska                  4044
Arizona                 4044
Arkansas                4044
California              4044
Colorado                4044
Connecticut             4044
Delaware                4044
District of Columbia    4044
Florida                 4044
Georgia                 4044
Hawaii                  4044
Idaho                   4044
Illinois                4044
Indiana                 4044
Iowa                    4044
Kansas                  4043
Kentucky                4043
Louisiana               4043
Maine                   4043
Maryland                4043
Massachusetts           4043
Michigan                4043
Minnesota               4043
Mississippi             4043
Missouri                4043
Montana                 4043
Nebraska                4043
Nevada                  4043
New Hampshire           4043
New Jersey              4043
New Mexico              4043
New York                4043
North Carolina          4043
North Dakota  

In [11]:
df_customers['date_joined'].value_counts().sort_index()

1/1/2017     159
1/1/2018     147
1/1/2019     153
1/1/2020     153
1/10/2017    192
            ... 
9/8/2018     164
9/8/2019     158
9/9/2017     186
9/9/2018     174
9/9/2019     181
Name: date_joined, Length: 1187, dtype: int64

In [12]:
df_customers['n_dependants'].value_counts().sort_index()

0    51602
1    51531
2    51482
3    51594
Name: n_dependants, dtype: int64

In [13]:
df_customers['fam_status'].value_counts().sort_index()

divorced/widowed                     17640
living with parents and siblings      9701
married                             144906
single                               33962
Name: fam_status, dtype: int64

### Let's do some wrangling

In [14]:
# changing column names to be more intuitive
df_customers.rename(columns={'Surnam':'Surname'}, inplace= True)
df_customers.rename(columns={'STATE':'State'}, inplace= True)
df_customers.rename(columns={'income':'Income'}, inplace= True)

In [15]:
#checking renaming results
df_customers.head()

Unnamed: 0,user_id,First Name,Surname,Gender,State,Age,date_joined,n_dependants,fam_status,Income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [16]:
# changing user_id datatype to string
df_customers['user_id']=df_customers['user_id'].astype('str')

In [17]:
# checking the result of changing datatype
df_customers['user_id'].dtype

dtype('O')

# 04. Data quality and consistency (Step 5)

In [18]:
# looking for mixed type data

for col in df_customers.columns.tolist():
  weird = (df_customers[[col]].applymap(type) != df_customers[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_customers[weird]) > 0:
    print (col)

First Name


In [19]:
# changing dtype for column "first name"
df_customers['First Name']=df_customers['First Name'].astype('str')

In [20]:
# checking for missing values
df_customers.isnull().sum()

user_id         0
First Name      0
Surname         0
Gender          0
State           0
Age             0
date_joined     0
n_dependants    0
fam_status      0
Income          0
dtype: int64

<font color=blue>There were no missing values in our df</font> 


In [21]:
# looking for duplicates by creating a subset
df_dups= df_customers[df_customers.duplicated()]

In [22]:
df_dups.head()

Unnamed: 0,user_id,First Name,Surname,Gender,State,Age,date_joined,n_dependants,fam_status,Income


<font color=blue>There were no duplicates in our df</font> 

# 06. Merging Data (Step 6)

In [23]:
# checking our other df
ords_prods_merge.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,...,price_range_loc,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,avg_price_of_products_purchased,spending_flag,median_days_since_prior_order,frequency_flag
0,2539329,1,1,2,8,11.0,196,1,0,Soda,...,Mid-range product,Regularly busy,Regularly busy,Average orders,10,New customer,6.367797,Low Spender,20.0,Regular customer
1,2398795,1,2,3,7,15.0,196,1,1,Soda,...,Mid-range product,Regularly busy,Least busy days,Average orders,10,New customer,6.367797,Low Spender,20.0,Regular customer
2,473747,1,3,3,12,21.0,196,1,1,Soda,...,Mid-range product,Regularly busy,Least busy days,Most orders,10,New customer,6.367797,Low Spender,20.0,Regular customer
3,2254736,1,4,4,7,29.0,196,1,1,Soda,...,Mid-range product,Least busy,Least busy days,Average orders,10,New customer,6.367797,Low Spender,20.0,Regular customer
4,431534,1,5,4,15,28.0,196,1,1,Soda,...,Mid-range product,Least busy,Least busy days,Most orders,10,New customer,6.367797,Low Spender,20.0,Regular customer


In [24]:
#checking columns in our other df 
ords_prods_merge.columns

Index(['order_id', 'user_id', 'order_number', 'orders_day_of_week',
       'order_hour_of_day', 'days_since_prior_order', 'product_id',
       'add_to_cart_order', 'reordered', 'product_name', 'aisle_id',
       'department_id', 'prices', 'price_range_loc', 'busiest_day',
       'busiest_days', 'busiest_period_of_day', 'max_order', 'loyalty_flag',
       'avg_price_of_products_purchased', 'spending_flag',
       'median_days_since_prior_order', 'frequency_flag'],
      dtype='object')

In [25]:
ords_prods_merge.shape

(32404859, 23)

In [26]:
df_customers.shape

(206209, 10)

In [27]:
# we want to do a merge on user_id, double checking dtype in the ords_prods_merge df
ords_prods_merge['user_id'].dtype

dtype('int64')

In [28]:
# changing datatype to string
ords_prods_merge['user_id'] = ords_prods_merge['user_id'].astype('str')

In [29]:
# checking datatype after changing
ords_prods_merge['user_id'].dtype

dtype('O')

In [30]:
# exporting altered datasets - will try to merge them in a new notebook as here it failed due to memory issues no matter what I tried
ords_prods_merge.to_pickle(os.path.join(path, '02 Data','Prepared Data', 'orders_products_merged_for_final_merge.pkl'))
df_customers.to_pickle(os.path.join(path, '02 Data','Prepared Data', 'customers_for_final_merge.pkl'))