### Importing libraries

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

### Importing data

In [2]:
# Load the raw customer table into `cus`.
# NOTE(review): absolute, machine-specific path — this cell only runs where this folder exists.
cus = pd.read_csv(r"C:\Users\user\Desktop\Data Analytics\02-12-2023 Instacart Basket Analysis\02 Data\Original data\customers.csv")

### Contents
#### 1. Data Wrangling
#### 2. Merging customers df with orders and products data
#### 3. Exporting new dataframe

### 1. Data Wrangling

In [3]:
# Keep only the customer attributes the analysis uses; name, gender and join-date columns are discarded.
# 'Surnam' is spelled as given — presumably matching the CSV header, since drop raises on unknown labels by default.
cus = cus.drop(['First Name', 'Surnam', 'Gender', 'date_joined'], axis = 1)

In [4]:
#Renaming n_dependants column to # of dependants
# Reassign the renamed frame instead of mutating with inplace=True: same resulting columns,
# and consistent with the `cus = cus.drop(...)` step in the previous cell.
cus = cus.rename(columns = {'n_dependants' : '# of dependants'})

In [5]:
#Viewing output (a bare DataFrame expression renders pandas' truncated preview: first and last rows)
cus

Unnamed: 0,user_id,STATE,Age,# of dependants,fam_status,income
0,26711,Missouri,48,3,married,165665
1,33890,New Mexico,36,0,single,59285
2,65803,Idaho,35,2,married,99568
3,125935,Iowa,40,0,single,42049
4,130797,Maryland,26,1,married,40374
...,...,...,...,...,...,...
206204,168073,North Carolina,44,1,married,148828
206205,49635,Hawaii,62,3,married,168639
206206,135902,Missouri,66,2,married,53374
206207,81095,California,27,1,married,99799


#### Checking for mixed type columns

In [6]:
# Print the name of every column that holds more than one Python type.
# Each value's type is compared with the type of the column's first value; any mismatch flags the column.
# Series.map is used instead of DataFrame.applymap, which is deprecated since pandas 2.1.
for col in cus.columns.tolist():
    first_type = type(cus[col].iloc[0])
    # Boolean mask, True for rows whose value type differs from the first row's type
    weird = cus[col].map(type) != first_type
    if weird.any():
        print(col)

#### Checking for missing data

In [7]:
# Tally missing entries per column; a column showing 0 has no gaps
nan_count = cus.isnull().sum(axis = 0)

print(nan_count)

user_id            0
STATE              0
Age                0
# of dependants    0
fam_status         0
income             0
dtype: int64


#### Checking for duplicates

In [8]:
# Collect rows that exactly repeat an earlier row across every column
# (the first occurrence of each row is not flagged)
cus_dups = cus.loc[cus.duplicated(keep = 'first')]
print(cus_dups)

Empty DataFrame
Columns: [user_id, STATE, Age, # of dependants, fam_status, income]
Index: []


In [9]:
# Count rows whose user_id already appeared in an earlier row (0 means user_id is unique)
us_dups = cus.duplicated(subset = ['user_id']).sum()
print(us_dups)

0


### 2. Merging customers df with orders and products data

In [12]:
#Checking the column data types before the merge (user_id is the join key used in the merge cell below)
cus.dtypes

user_id             int64
STATE              object
Age                 int64
# of dependants     int64
fam_status         object
income              int64
dtype: object

In [13]:
#Importing orders and products df
# NOTE(review): unpickling can execute arbitrary code — only load pickle files produced by your own pipeline.
# NOTE(review): absolute, machine-specific path — this cell only runs where this folder exists.
ords_prods = pd.read_pickle (r"C:\Users\user\Desktop\Data Analytics\02-12-2023 Instacart Basket Analysis\02 Data\Prepared data\orders_products_combined3.pkl")

In [14]:
#Merging customer attributes onto every order-product row
# how='inner' (the pandas default, now explicit) keeps only rows whose user_id exists in both frames.
# validate='many_to_one' makes pandas raise MergeError if user_id is repeated in cus,
# which would otherwise silently duplicate order rows.
df_merged = ords_prods.merge(cus, on = 'user_id', how = 'inner', validate = 'many_to_one')

In [15]:
#Viewing output: first rows of the merged frame, with the customer columns appended on the right
df_merged.head(20)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge,...,loyalty_flag,avg_price,spending_habit,med_days,order_frequency,STATE,Age,# of dependants,fam_status,income
0,2539329,1,1,2,8,11.114836,196,1,0,both,...,New customer,6.367797,Low spender,20.0,Regular customer,Alabama,31,3,married,40423
1,2398795,1,2,3,7,15.0,196,1,1,both,...,New customer,6.367797,Low spender,20.0,Regular customer,Alabama,31,3,married,40423
2,473747,1,3,3,12,21.0,196,1,1,both,...,New customer,6.367797,Low spender,20.0,Regular customer,Alabama,31,3,married,40423
3,2254736,1,4,4,7,29.0,196,1,1,both,...,New customer,6.367797,Low spender,20.0,Regular customer,Alabama,31,3,married,40423
4,431534,1,5,4,15,28.0,196,1,1,both,...,New customer,6.367797,Low spender,20.0,Regular customer,Alabama,31,3,married,40423
5,3367565,1,6,2,7,19.0,196,1,1,both,...,New customer,6.367797,Low spender,20.0,Regular customer,Alabama,31,3,married,40423
6,550135,1,7,1,9,20.0,196,1,1,both,...,New customer,6.367797,Low spender,20.0,Regular customer,Alabama,31,3,married,40423
7,3108588,1,8,1,14,14.0,196,2,1,both,...,New customer,6.367797,Low spender,20.0,Regular customer,Alabama,31,3,married,40423
8,2295261,1,9,1,16,0.0,196,4,1,both,...,New customer,6.367797,Low spender,20.0,Regular customer,Alabama,31,3,married,40423
9,2550362,1,10,4,8,30.0,196,1,1,both,...,New customer,6.367797,Low spender,20.0,Regular customer,Alabama,31,3,married,40423


In [16]:
# Per-column count of missing values in the merged frame.
# NOTE(review): the merge is an inner join, which drops unmatched rows instead of leaving NaN,
# so zeros here do not by themselves prove every order row found a customer — compare row counts to confirm.
nan_c = df_merged.isnull().sum(axis = 0)

print(nan_c)

order_id                  0
user_id                   0
order_number              0
orders_day_of_week        0
hour_of_day               0
days_since_prior_order    0
product_id                0
add_to_cart_order         0
reordered                 0
_merge                    0
product_name              0
aisle_id                  0
department_id             0
prices                    0
price_range_loc           0
busiest_day               0
busiest_days              0
busiest_period_of_day     0
max_order                 0
loyalty_flag              0
avg_price                 0
spending_habit            0
med_days                  0
order_frequency           0
STATE                     0
Age                       0
# of dependants           0
fam_status                0
income                    0
dtype: int64


### 3. Exporting new dataframe

In [17]:
# Project root folder; the export cell below joins sub-folder names onto it.
# NOTE(review): absolute, machine-specific path.
path = r"C:\Users\user\Desktop\Data Analytics\02-12-2023 Instacart Basket Analysis"

In [18]:
#Exporting data to pickle
# Folder name uses the same casing ('Prepared data') as the read_pickle import path earlier in the
# notebook, so the file is written to the folder its input came from even on a case-sensitive filesystem.
df_merged.to_pickle(os.path.join(path, '02 Data','Prepared data', 'orders_products_combined4.pkl'))