# DATA VISUALIZATION IN PYTHON

## CONTENTS:
1. Import libraries and data
2. Check and clean the data set
3. Rename columns
4. Merge and export the data

### 1. Import libraries and data

In [4]:
# Import libraries
import pandas as pd
import numpy as np
import os

In [7]:
# Define the folder that holds the original (raw) data.
# The project data root ('02 Data') can be overridden with the INSTACART_DATA_DIR
# environment variable so the notebook also runs on other machines; when the
# variable is not set, the original location is used unchanged.
path = os.path.join(
    os.environ.get('INSTACART_DATA_DIR', r'C:\Users\susan\Documents\data analytics\Instacart Basket Analysis\02 Data'),
    'Original data'
)

In [10]:
# Import the customer data set (customers.csv) from the folder defined in path
cust = pd.read_csv(os.path.join(path, 'customers.csv'))

### 2. Check and clean the data set

In [17]:
# Preview the first five rows to see which columns the customer data contains.
# Note: at this point the frame still holds the name columns, so this output shows personal data.
cust.head(5)

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [23]:
# Preview the frame without the first and last name columns, which are not relevant for this analysis
# (and may also be sensitive under GDPR - to be confirmed).
# drop() returns a new DataFrame and the result is not assigned here, so cust itself is unchanged by this cell.
cust.drop(columns = ['First Name', 'Surnam'])

Unnamed: 0,user_id,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Female,Maryland,26,1/1/2017,1,married,40374
...,...,...,...,...,...,...,...,...
206204,168073,Female,North Carolina,44,4/1/2020,1,married,148828
206205,49635,Male,Hawaii,62,4/1/2020,3,married,168639
206206,135902,Female,Missouri,66,4/1/2020,2,married,53374
206207,81095,Female,California,27,4/1/2020,1,married,99799


In [28]:
# Actually drop the name columns: the result is stored in a new frame called custs, so the originally loaded frame cust is not altered
custs = cust.drop(columns = ['First Name', 'Surnam'])

In [30]:
# Confirm that the name columns are no longer present in custs
custs.head(5)

Unnamed: 0,user_id,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Female,Maryland,26,1/1/2017,1,married,40374


In [42]:
# Check the key column user_id, since this is probably the column both files will be merged on.
# value_counts() lists each distinct value with its number of occurrences; with dropna = False a NaN
# would be listed as a value of its own. Every id shown occurs once and there are 206209 distinct
# values, so the ids are unique.
custs['user_id'].value_counts(dropna = False)

user_id
26711     1
67322     1
173044    1
61044     1
98344     1
         ..
146847    1
154991    1
172193    1
184326    1
80148     1
Name: count, Length: 206209, dtype: int64

In [46]:
# Inspect the income column. The Length reported by value_counts() (108012) is the number of
# DISTINCT income values, not the number of non-missing rows, so it does not indicate missing
# values: several customers simply share the same income. With dropna = False a NaN would be
# listed as an entry of its own. Missing values are counted with isnull().sum() instead.
custs['income'].value_counts(dropna = False)

income
57192     10
95891     10
95710     10
97532      9
98675      9
          ..
73141      1
71524      1
74408      1
44780      1
148828     1
Name: count, Length: 108012, dtype: int64

### 3. Rename columns

In [56]:
# Rename the Gender column to lower case so that all column names use small letters.
# The renamed frame is assigned back to custs instead of mutating it with inplace = True;
# a label that is no longer present is ignored by rename(), so the cell can be re-run safely.
custs = custs.rename(columns = {'Gender' : 'gender'})

In [58]:
# Rename the STATE column to lower case so that all column names use small letters.
# The renamed frame is assigned back to custs instead of mutating it with inplace = True;
# a label that is no longer present is ignored by rename(), so the cell can be re-run safely.
custs = custs.rename(columns = {'STATE' : 'state'})

In [60]:
# Rename the Age column to lower case so that all column names use small letters.
# The renamed frame is assigned back to custs instead of mutating it with inplace = True;
# a label that is no longer present is ignored by rename(), so the cell can be re-run safely.
custs = custs.rename(columns = {'Age' : 'age'})

In [66]:
# Summary statistics of the numeric columns of custs. The count is 206209 for every column,
# income included, so no numeric values are missing (the 108012 reported by value_counts()
# for income above was the number of distinct values, not the number of entries).
custs.describe()

Unnamed: 0,user_id,age,n_dependants,income
count,206209.0,206209.0,206209.0,206209.0
mean,103105.0,49.501646,1.499823,94632.852548
std,59527.555167,18.480962,1.118433,42473.786988
min,1.0,18.0,0.0,25903.0
25%,51553.0,33.0,0.0,59874.0
50%,103105.0,49.0,1.0,93547.0
75%,154657.0,66.0,3.0,124244.0
max,206209.0,81.0,3.0,593901.0


In [90]:
# Check every column for mixed data types: each value is mapped to its Python type and a
# column is reported when it holds more than one type. Nothing is printed for custs, so
# there do not seem to be any mixed-type columns.
for col in custs:
    value_types = custs[col].map(type)
    if value_types.nunique(dropna = False) <= 1:
        continue  # only one type in this column - nothing to report
    print(f"Column '{col}' has mixed types:")
    print(value_types.value_counts())
    # Show the values whose type differs from the type of the first value in the column
    print(custs.loc[value_types != value_types.iloc[0], col])


In [92]:
# Count the null values per column. Every count is 0, so no missing values can be found here.
custs.isnull().sum()

user_id         0
gender          0
state           0
age             0
date_joined     0
n_dependants    0
fam_status      0
income          0
dtype: int64

In [98]:
# Check the shape of custs: 206209 rows, 8 columns. A useful reference point for comparing before and after
# when data is altered; here there were no missing values and nothing had to be dropped.
custs.shape

(206209, 8)

In [100]:
# Collect full duplicates: rows that repeat an earlier row in every column.
custs_dups = custs.loc[custs.duplicated(keep = 'first')]

In [104]:
# Display the full duplicates collected above. The frame is empty, so there were none.
custs_dups

Unnamed: 0,user_id,gender,state,age,date_joined,n_dependants,fam_status,income


In [140]:
# Define the folder that holds the prepared data, used for importing the second file.
# NOTE: this rebinds path - from here on it points at 'Prepared data', so re-running the
# customers.csv import cell after this one would look in the wrong folder.
# The project data root ('02 Data') can be overridden with the INSTACART_DATA_DIR
# environment variable; when the variable is not set, the original location is used unchanged.
path = os.path.join(
    os.environ.get('INSTACART_DATA_DIR', r'C:\Users\susan\Documents\data analytics\Instacart Basket Analysis\02 Data'),
    'Prepared data'
)

In [142]:
# Import the prepared orders/products data (ords_prods_newest.pkl) from the folder defined in path.
# NOTE: pd.read_pickle can execute arbitrary code while loading, so only load pickle files from a trusted source.
ords_prods = pd.read_pickle(os.path.join(path, 'ords_prods_newest.pkl'))

In [None]:
# Check the shape (number of rows and columns) of ords_prods.
ords_prods.shape

In [146]:
# Preview the first two rows of ords_prods to check that the import worked
ords_prods.head(2)


Unnamed: 0,order_id,user_id,validation,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,...,prices,price_range_loc,busiest day,busiest_hour,max_order,loyalty_flag,average_price,spending_flag,avg_price,frequency_flag
0,2539329,1,prior,1,2,8,,196,1,0,...,9.0,Mid-range product,Regularly busy,Most orders,10,New customer,6.367797,High spender,5.813559,
1,2539329,1,prior,1,2,8,,14084,2,0,...,12.5,Mid-range product,Regularly busy,Most orders,10,New customer,6.367797,High spender,5.813559,


In [148]:
# Re-check the shape of custs (rows, columns) before merging it with ords_prods
custs.shape

(206209, 8)

### 4. Merge and export the data

In [157]:
# Merge the customer data (custs) with the orders/products data (ords_prods) on user_id.
# how = 'inner' is the pandas default, spelled out here: only user_ids present in both frames are kept.
# validate = 'one_to_many' makes pandas raise a MergeError if a user_id occurs more than once in custs,
# which would otherwise silently duplicate order rows in the result.
ords_prods_custs = custs.merge(ords_prods, on = 'user_id', how = 'inner', validate = 'one_to_many')

In [161]:
# Preview the first two rows to check that the customer columns and the order columns are now in one frame
ords_prods_custs.head(2)

Unnamed: 0,user_id,gender,state,age,date_joined,n_dependants,fam_status,income,order_id,validation,...,prices,price_range_loc,busiest day,busiest_hour,max_order,loyalty_flag,average_price,spending_flag,avg_price,frequency_flag
0,1,Female,Alabama,31,2/17/2019,3,married,40423,2539329,prior,...,9.0,Mid-range product,Regularly busy,Most orders,10,New customer,6.367797,High spender,5.813559,
1,1,Female,Alabama,31,2/17/2019,3,married,40423,2539329,prior,...,12.5,Mid-range product,Regularly busy,Most orders,10,New customer,6.367797,High spender,5.813559,


In [164]:
# Export the merged data as a pickle file. path was last set to the 'Prepared data' folder, so the file is written there.
ords_prods_custs.to_pickle(os.path.join(path, 'ords_prods_custs.pkl'))