# EDA + Category Mapping

In [37]:
import os
import random

import numpy as np
import pandas as pd
from scipy.stats import mode
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

### Prepare item data

In [38]:
# Load the item catalogue and preview ten randomly chosen rows.
items = pd.read_csv(filepath_or_buffer='original_data/item_data.csv')
items.sample(n=10)

Unnamed: 0,item_id,brand,brand_type,category
73403,73404,2164,Established,Alcohol
20993,20994,1124,Established,Grocery
9980,9981,1337,Established,Grocery
73090,73091,702,Established,Packaged Meat
48162,48163,22,Established,Pharmaceutical
59621,59622,914,Established,Grocery
48492,48493,4419,Established,Pharmaceutical
5984,5985,56,Local,Grocery
17213,17214,5108,Established,Pharmaceutical
58633,58634,1496,Established,Pharmaceutical


In [39]:
# Brand information is not used downstream; keep only item_id and category.
items.drop(columns=['brand_type', 'brand'], inplace=True)

In [40]:
# Number of catalogue items in each category.
items['category'].value_counts()

Grocery                   32448
Pharmaceutical            24471
Natural Products           2533
Dairy, Juices & Snacks     2425
Skin & Hair Care           2244
Meat                       2080
Packaged Meat              1966
Prepared Food              1880
Bakery                     1679
Seafood                     728
Flowers & Plants            664
Miscellaneous               385
Alcohol                     303
Garden                      102
Restauarant                  78
Salads                       40
Travel                       19
Fuel                         14
Vegetables (cut)              7
Name: category, dtype: int64

In [41]:
# Count missing values per column of the item table.
items.isna().sum()

item_id     0
category    0
dtype: int64

### Prepare customer demographics data

In [42]:
# Load the customer demographics table and display it.
customers = pd.read_csv(filepath_or_buffer='original_data/customer_demographics.csv')
customers

Unnamed: 0,customer_id,age_range,marital_status,rented,family_size,no_of_children,income_bracket
0,1,70+,Married,0,2,,4
1,6,46-55,Married,0,2,,5
2,7,26-35,,0,3,1,3
3,8,26-35,,0,4,2,6
4,10,46-55,Single,0,1,,5
...,...,...,...,...,...,...,...
755,1577,36-45,,0,2,,5
756,1578,46-55,Married,0,3,1,6
757,1579,46-55,,0,1,,4
758,1580,26-35,,0,2,,5


In [43]:
# Strip the open-ended '+' suffix (e.g. '5+') and convert the counts to integers.
# Missing child counts stay NaN for now so the rules below can infer them.
customers['family_size'] = customers['family_size'].map(lambda size: int(size.replace('+', '')))
customers['no_of_children'] = customers['no_of_children'].map(
    lambda kids: kids if pd.isna(kids) else int(kids.replace('+', ''))
)

# Ordered imputation rules: (column to fill, row selector, fill value).
# Every selector is evaluated against the *current* state of `customers`,
# so later rules see the values written by earlier ones — order matters.
imputation_rules = [
    # A one-person household is single.
    ('marital_status',
     lambda c: c['marital_status'].isna() & (c['family_size'] == 1),
     'Single'),
    # Exactly one adult once children are subtracted -> single.
    ('marital_status',
     lambda c: c['marital_status'].isna() & ((c['family_size'] - c['no_of_children']) == 1),
     'Single'),
    # Exactly two adults once children are subtracted -> married.
    ('marital_status',
     lambda c: c['marital_status'].isna() & ((c['family_size'] - c['no_of_children']) == 2),
     'Married'),
    # Two-person household with unknown child count -> treated as a married couple.
    ('marital_status',
     lambda c: c['marital_status'].isna() & c['no_of_children'].isna() & (c['family_size'] == 2),
     'Married'),
    # Married two-person household -> no children.
    ('no_of_children',
     lambda c: c['no_of_children'].isna() & (c['marital_status'] == 'Married') & (c['family_size'] == 2),
     0),
    # One-person household -> no children.
    ('no_of_children',
     lambda c: c['no_of_children'].isna() & (c['family_size'] == 1),
     0),
    # Remaining two-person households (not married) -> one child.
    ('no_of_children',
     lambda c: c['no_of_children'].isna() & (c['family_size'] == 2),
     1),
]
for column, select_rows, fill_value in imputation_rules:
    customers.loc[select_rows(customers), column] = fill_value

# Every child count is expected to be filled by now; the cast raises if any NaN remains.
customers['no_of_children'] = customers['no_of_children'].astype(np.int64)

In [44]:
# The 'rented' flag is not used in the rest of the notebook.
customers.drop(columns='rented', inplace=True)

#### Fill in demographics for customer IDs that appear in the transactions dataframe but are missing from the customers dataframe

In [45]:
# Load the transaction log; it is used next to find customer ids that have
# purchases but no demographics record.
transactions = pd.read_csv(filepath_or_buffer='original_data/customer_transaction_data.csv')
transactions.sample(n=5)

Unnamed: 0,date,customer_id,item_id,quantity,selling_price,other_discount,coupon_discount
792310,2012-12-21,1284,9927,1,131.44,0.0,0.0
1300425,2013-06-25,205,17323,3,159.22,0.0,0.0
787320,2012-12-19,1239,22552,1,31.7,-3.56,0.0
213222,2012-05-24,1532,16323,2,142.48,-113.27,0.0
512259,2012-09-14,989,31226,4,213.72,-126.81,0.0


In [46]:
# Customer ids that occur in the transactions but have no demographics row.
# A set difference avoids rescanning the whole customers array for every id.
known_customer_ids = set(customers.customer_id.values)
ids_to_add = sorted(set(transactions.customer_id.unique()) - known_customer_ids)

In [47]:
# Empirical joint distribution of the demographic attributes among known customers:
# one row per distinct attribute combination plus its relative frequency.
demographics = (
    customers.drop(columns='customer_id')
    .value_counts(normalize=True)
    .rename('count_normalized')
    .reset_index()
)
probs = demographics['count_normalized'].to_numpy()
row_num = len(ids_to_add)

# Draw one attribute combination per missing customer, weighted by its frequency.
# Sampling row positions (instead of a record array) keeps the column dtypes intact,
# and the fixed seed makes the synthetic customers reproducible across runs.
demographics_rng = np.random.default_rng(47)
sampled_rows = demographics_rng.choice(len(demographics), size=row_num, p=probs)
new_customers = (
    demographics.drop(columns='count_normalized')
    .iloc[sampled_rows]
    .reset_index(drop=True)
)
new_customers['customer_id'] = ids_to_add
new_customers

Unnamed: 0,age_range,marital_status,family_size,no_of_children,income_bracket,customer_id
0,46-55,Married,2,0,4,2
1,36-45,Married,2,0,6,3
2,46-55,Single,1,0,4,4
3,26-35,Single,1,0,5,5
4,18-25,Single,2,1,1,9
...,...,...,...,...,...,...
817,18-25,Single,1,0,1,1570
818,36-45,Married,2,0,7,1571
819,46-55,Single,1,0,3,1575
820,70+,Married,2,0,3,1576


In [48]:
# Combine the original and the synthetic customers into one table ordered by id.
# pd.concat replaces DataFrame.append, which is deprecated and removed in pandas 2.0.
customers = (
    pd.concat([customers, new_customers])
    .sort_values('customer_id')
    .reset_index(drop=True)
)
customers

Unnamed: 0,customer_id,age_range,marital_status,family_size,no_of_children,income_bracket
0,1,70+,Married,2,0,4
1,2,46-55,Married,2,0,4
2,3,36-45,Married,2,0,6
3,4,46-55,Single,1,0,4
4,5,26-35,Single,1,0,5
...,...,...,...,...,...,...
1577,1578,46-55,Married,3,1,6
1578,1579,46-55,Single,1,0,4
1579,1580,26-35,Married,2,0,5
1580,1581,26-35,Married,3,1,1


### Prepare transaction data

In [49]:
# Preview five random transactions before transforming them.
transactions.sample(n=5)

Unnamed: 0,date,customer_id,item_id,quantity,selling_price,other_discount,coupon_discount
230317,2012-05-31,112,33025,1,55.92,0.0,0.0
1094234,2013-04-09,1,19632,1,53.43,-24.58,0.0
789586,2012-12-20,1518,8443,1,35.62,-7.84,0.0
994731,2013-03-03,865,62238,1,106.5,-35.62,0.0
29890,2012-02-22,791,4934,1,248.98,-21.37,0.0


In [50]:
# Convert the row totals to per-unit amounts, then fold the (per-unit) other
# discount into the selling price and discard the now-redundant column.
per_unit_price = transactions['selling_price'] / transactions['quantity']
per_unit_other_discount = transactions['other_discount'] / transactions['quantity']
transactions['selling_price'] = per_unit_price + per_unit_other_discount
transactions.drop(columns=['other_discount'], inplace=True)
transactions.sample(n=5)

Unnamed: 0,date,customer_id,item_id,quantity,selling_price,coupon_discount
649665,2012-11-02,791,56857,1,113.62,0.0
49852,2012-03-10,1448,32324,1,69.1,0.0
255386,2012-06-10,1558,29168,1,149.24,0.0
991257,2013-03-02,424,14680,1,35.62,0.0
1192062,2013-05-15,78,20673,1,91.54,0.0


In [51]:
# Binary flag: 1 when the row carries a non-zero coupon discount, otherwise 0.
transactions['coupon_used'] = transactions['coupon_discount'].ne(0).astype(np.int64)
transactions.sample(n=5)

Unnamed: 0,date,customer_id,item_id,quantity,selling_price,coupon_discount,coupon_used
619982,2012-10-21,540,24128,1,63.76,0.0,0
1101557,2013-04-12,307,22907,1,32.41,0.0,0
1200290,2013-05-17,1551,54069,1,85.13,0.0,0
231318,2012-05-31,1142,5088,1,56.64,0.0,0
40432,2012-03-03,746,31822,1,75.16,0.0,0


In [52]:
# transactions['date'] = pd.to_datetime(transactions['date'])
# transactions_with_date = transactions.copy()
# transactions_with_date['day'] = transactions_with_date['date'].apply(lambda x: x.day)
# transactions_with_date['dow'] = transactions_with_date['date'].apply(lambda x: x.weekday())
# transactions_with_date['month'] = transactions_with_date['date'].apply(lambda x: x.month)
# transactions_with_date.drop('date', axis=1, inplace=True)
# transactions_with_date.sample(5)

In [53]:
# The purchase date is not used as a feature.
transactions.drop(columns='date', inplace=True)

In [54]:
# Rescale quantity to a range that is more realistic for a clothing store.
def _print_quantity_stats(quantity):
    """Print the min, max, mean, median and mode of a quantity Series."""
    summary = {
        'Min': quantity.min(),
        'Max': quantity.max(),
        'Mean': quantity.mean(),
        'Median': quantity.median(),
        'Mode': quantity.mode()[0],
    }
    for label, value in summary.items():
        print(f'{label} quantity: {value}')


_print_quantity_stats(transactions['quantity'])

# Target range: minimum 1, maximum e.g. 23, with whole-number quantities.
scaler = MinMaxScaler(feature_range=(1, 23))
transactions['quantity'] = scaler.fit_transform(transactions[['quantity']]).ravel()
transactions['quantity'] = transactions['quantity'].round().astype(np.int64)

print('\nNew values:')
_print_quantity_stats(transactions['quantity'])

Min quantity: 1
Max quantity: 89638
Mean quantity: 130.6633395391396
Median quantity: 1.0
Mode quantity: 1

New values:
Min quantity: 1
Max quantity: 23
Mean quantity: 1.0317772009850774
Median quantity: 1.0
Mode quantity: 1


## Merge customers, items, and transactions into one dataframe

In [55]:
# Attach customer demographics and item category to every transaction row.
df = (
    transactions
    .merge(customers, how='left', on='customer_id')
    .merge(items, how='left', on='item_id')
)
df.sample(n=5)

Unnamed: 0,customer_id,item_id,quantity,selling_price,coupon_discount,coupon_used,age_range,marital_status,family_size,no_of_children,income_bracket,category
938671,330,46373,1,14.25,0.0,0,26-35,Married,4,2,4,Grocery
423809,1051,23379,1,43.1,0.0,0,46-55,Married,2,0,2,Packaged Meat
959855,1385,57398,1,53.075,0.0,0,36-45,Single,1,0,3,Pharmaceutical
1260141,479,37379,1,135.0,0.0,0,46-55,Married,2,0,10,Grocery
782061,411,30315,1,55.57,0.0,0,46-55,Married,2,0,6,Grocery


In [56]:
# Verify that the left joins left no missing values behind.
df.isna().sum()

customer_id        0
item_id            0
quantity           0
selling_price      0
coupon_discount    0
coupon_used        0
age_range          0
marital_status     0
family_size        0
no_of_children     0
income_bracket     0
category           0
dtype: int64

## Map Categories

In [57]:
# Transaction count for each original category.
df['category'].value_counts()

Grocery                   942176
Pharmaceutical            155245
Packaged Meat              66110
Natural Products           52981
Meat                       23056
Dairy, Juices & Snacks     20465
Bakery                     18218
Fuel                       13797
Prepared Food              13069
Seafood                     7043
Skin & Hair Care            4868
Miscellaneous               3197
Flowers & Plants            1602
Alcohol                     1343
Salads                       429
Garden                       413
Travel                       247
Restauarant                  211
Vegetables (cut)              96
Name: category, dtype: int64

In [58]:
# Helper flags used to find what is bought more by customers with children:
# has_children = 1 when the customer has at least one child, no_children = 1 when none.
df['has_children'] = df['no_of_children'].gt(0).astype(np.int64)
df['no_children'] = df['no_of_children'].eq(0).astype(np.int64)

In [59]:
# Find items that are bought mainly by customers with children.
# Per item: number of purchases by customers with / without children.
# The native group sum is used instead of passing np.sum to agg, which recent
# pandas versions flag as a deprecated callable alias.
agg_items = (
    df.groupby('item_id')[['has_children', 'no_children']]
    .sum()
    .reset_index()
)

# Keep items whose "with children" purchases exceed the "without children"
# purchases by more than 20% of the "with children" count.
children_surplus = agg_items['has_children'] - agg_items['no_children']
more_with_children = agg_items.loc[children_surplus > (0.2 * agg_items['has_children'])]
items_for_children = more_with_children.item_id.values

# Total number of transactions that involve those items.
num_transactions = more_with_children.has_children.sum() + more_with_children.no_children.sum()
num_transactions

89753

In [60]:
# The helper flags were only needed to pick the child-oriented items.
df.drop(columns=['has_children', 'no_children'], inplace=True)

In [61]:
# Relabel every transaction of a child-oriented item as 'Boys' or 'Girls' (50/50).
# The number of draws is taken from the mask itself, so it always matches the
# rows being assigned, and the fixed seed keeps the relabelling reproducible.
children_rows = df['item_id'].isin(items_for_children)
children_rng = np.random.default_rng(61)
df.loc[children_rows, 'category'] = children_rng.choice(['Boys', 'Girls'], size=int(children_rows.sum()))

In [62]:
# Category distribution after introducing 'Boys' and 'Girls'.
df['category'].value_counts()

Grocery                   893064
Pharmaceutical            129570
Packaged Meat              63495
Natural Products           51035
Girls                      45007
Boys                       44746
Meat                       22007
Bakery                     17208
Dairy, Juices & Snacks     16176
Fuel                       13789
Prepared Food              11846
Seafood                     6445
Skin & Hair Care            3775
Miscellaneous               2962
Flowers & Plants            1312
Alcohol                      854
Salads                       416
Garden                       367
Travel                       239
Restauarant                  158
Vegetables (cut)              95
Name: category, dtype: int64

In [63]:
# Split the grocery category into three parts, because it is too large.
# Items (not rows) are partitioned, so every grocery item lands in exactly one part.
grocery_item_ids = df.loc[df.category == 'Grocery', 'item_id'].unique()

# Seeded NumPy permutation: reproducible, and avoids relying on the stdlib
# random.shuffle (written for Python sequences) to reorder an ndarray in place.
grocery_item_ids = np.random.default_rng(63).permutation(grocery_item_ids)

sections = np.array_split(grocery_item_ids, 3)
for section_no, section_item_ids in enumerate(sections):
    df.loc[df.item_id.isin(section_item_ids), 'category'] = f'Grocery {section_no}'

In [64]:
# Category distribution after splitting 'Grocery' into three parts.
df['category'].value_counts()

Grocery 0                 312104
Grocery 2                 295711
Grocery 1                 285249
Pharmaceutical            129570
Packaged Meat              63495
Natural Products           51035
Girls                      45007
Boys                       44746
Meat                       22007
Bakery                     17208
Dairy, Juices & Snacks     16176
Fuel                       13789
Prepared Food              11846
Seafood                     6445
Skin & Hair Care            3775
Miscellaneous               2962
Flowers & Plants            1312
Alcohol                      854
Salads                       416
Garden                       367
Travel                       239
Restauarant                  158
Vegetables (cut)              95
Name: category, dtype: int64

In [65]:
# Source categories that are folded into each clothing-store department.
to_women = [
    'Grocery 0', 'Pharmaceutical', 'Skin & Hair Care', 'Flowers & Plants', 'Salads',
    'Garden', 'Vegetables (cut)', 'Dairy, Juices & Snacks', 'Prepared Food',
    'Miscellaneous', 'Alcohol', 'Travel', 'Restauarant', 'Fuel', 'Natural Products',
]
to_men = ['Grocery 1', 'Packaged Meat', 'Meat', 'Seafood', 'Bakery']
to_sports = ['Grocery 2']

# Apply the relabelling department by department; categories that appear in
# none of the lists (e.g. 'Boys', 'Girls') are left untouched.
for department, source_categories in (('Women', to_women), ('Men', to_men), ('Sports', to_sports)):
    df.loc[df['category'].isin(source_categories), 'category'] = department

category_counts = df['category'].value_counts()
category_counts

Women     544698
Men       394404
Sports    295711
Girls      45007
Boys       44746
Name: category, dtype: int64

### Add gender

In [66]:
# Probability of ('F', 'M') for a transaction in each category.
gender_split_by_category = {
    'Women': [0.9, 0.1],
    'Men': [0.2, 0.8],
    'Sports': [0.4, 0.6],
    # NOTE(review): 'Girls' uses the same male-skewed split as 'Boys'; this looks
    # like a copy-paste of the 'Boys' line — confirm whether [0.8, 0.2] was intended.
    'Girls': [0.2, 0.8],
    'Boys': [0.2, 0.8],
}

# Start from an object-dtype column so the string labels can be written without an
# incompatible-dtype assignment into a float column; unmatched rows stay NaN.
df['gender'] = pd.Series(np.nan, index=df.index, dtype=object)

# Fixed seed for reproducibility; the number of draws comes from the row mask
# itself, so it cannot drift from counts computed in another cell.
gender_rng = np.random.default_rng(66)
for category_name, f_m_split in gender_split_by_category.items():
    category_rows = df['category'] == category_name
    df.loc[category_rows, 'gender'] = gender_rng.choice(
        ['F', 'M'], size=int(category_rows.sum()), p=f_m_split
    )
df.gender.value_counts()

F    706017
M    618549
Name: gender, dtype: int64

## Feature engineering

In [67]:
# Print column dtypes, non-null counts and memory usage of the merged frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1324566 entries, 0 to 1324565
Data columns (total 13 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   customer_id      1324566 non-null  int64  
 1   item_id          1324566 non-null  int64  
 2   quantity         1324566 non-null  int64  
 3   selling_price    1324566 non-null  float64
 4   coupon_discount  1324566 non-null  float64
 5   coupon_used      1324566 non-null  int64  
 6   age_range        1324566 non-null  object 
 7   marital_status   1324566 non-null  object 
 8   family_size      1324566 non-null  int64  
 9   no_of_children   1324566 non-null  int64  
 10  income_bracket   1324566 non-null  int64  
 11  category         1324566 non-null  object 
 12  gender           1324566 non-null  object 
dtypes: float64(2), int64(7), object(4)
memory usage: 141.5+ MB


In [68]:
# Per-customer averages plus the number of distinct items each customer bought.
# String aggregation names are used instead of NumPy callables / a Python lambda:
# they dispatch to pandas' native group reductions and avoid the deprecated
# callable aliases in recent pandas versions.
pt_means = (
    pd.pivot_table(
        df,
        index='customer_id',
        values=['item_id', 'quantity', 'selling_price', 'coupon_discount'],
        aggfunc={
            'item_id': 'nunique',       # distinct items purchased
            'quantity': 'mean',
            'selling_price': 'mean',
            'coupon_discount': 'mean',
        },
    )
    .reset_index()
    .rename(columns={
        'item_id': 'unique_items_per_cust',
        'quantity': 'mean_quantity_per_cust',
        'selling_price': 'mean_selling_price_per_cust',
        'coupon_discount': 'mean_discount_per_cust',
    })
)
pt_means

Unnamed: 0,customer_id,mean_discount_per_cust,unique_items_per_cust,mean_quantity_per_cust,mean_selling_price_per_cust
0,1,-2.019876,463,1.000000,71.795406
1,2,-0.595084,352,1.000000,80.941393
2,3,-3.091546,406,1.002837,56.932745
3,4,-0.404773,125,1.000000,120.489458
4,5,-0.114684,490,1.027778,92.961449
...,...,...,...,...,...
1577,1578,-0.980358,481,1.034483,64.351497
1578,1579,-0.291213,639,1.000000,82.581198
1579,1580,0.000000,422,1.009452,89.473090
1580,1581,0.000000,390,1.071287,66.668103


In [69]:
# Per-customer totals. 'item_id' is aggregated with len, i.e. the number of
# transaction rows of the customer; the other columns are summed.
# 'sum' is passed as a string instead of np.sum, which recent pandas versions
# flag as a deprecated callable alias.
pt_sums = (
    pd.pivot_table(
        df,
        index='customer_id',
        values=['item_id', 'quantity', 'selling_price', 'coupon_discount', 'coupon_used'],
        aggfunc={
            'item_id': len,
            'quantity': 'sum',
            'selling_price': 'sum',
            'coupon_discount': 'sum',
            'coupon_used': 'sum',
        },
    )
    .reset_index()
    .rename(columns={
        'item_id': 'total_items_per_cust',
        'quantity': 'total_quantity_per_cust',
        'selling_price': 'total_selling_price_per_cust',
        'coupon_discount': 'total_discount_per_cust',
        'coupon_used': 'total_coupons_used_per_cust',
    })
)
pt_sums

Unnamed: 0,customer_id,total_discount_per_cust,total_coupons_used_per_cust,total_items_per_cust,total_quantity_per_cust,total_selling_price_per_cust
0,1,-2116.83,78,1048,1048,75241.585333
1,2,-249.34,4,419,419,33914.443500
2,3,-2179.54,53,705,707,40137.585039
3,4,-89.05,1,220,220,26507.680667
4,5,-90.83,2,792,814,73625.467667
...,...,...,...,...,...,...
1577,1578,-767.62,29,783,810,50387.221827
1578,1579,-338.39,6,1162,1162,95959.352500
1579,1580,0.00,0,529,534,47331.264671
1580,1581,0.00,0,505,541,33667.391791


In [70]:
# One row per customer holding both the mean and the total statistics.
cust_stats = pt_means.merge(pt_sums, on='customer_id')
cust_stats

Unnamed: 0,customer_id,mean_discount_per_cust,unique_items_per_cust,mean_quantity_per_cust,mean_selling_price_per_cust,total_discount_per_cust,total_coupons_used_per_cust,total_items_per_cust,total_quantity_per_cust,total_selling_price_per_cust
0,1,-2.019876,463,1.000000,71.795406,-2116.83,78,1048,1048,75241.585333
1,2,-0.595084,352,1.000000,80.941393,-249.34,4,419,419,33914.443500
2,3,-3.091546,406,1.002837,56.932745,-2179.54,53,705,707,40137.585039
3,4,-0.404773,125,1.000000,120.489458,-89.05,1,220,220,26507.680667
4,5,-0.114684,490,1.027778,92.961449,-90.83,2,792,814,73625.467667
...,...,...,...,...,...,...,...,...,...,...
1577,1578,-0.980358,481,1.034483,64.351497,-767.62,29,783,810,50387.221827
1578,1579,-0.291213,639,1.000000,82.581198,-338.39,6,1162,1162,95959.352500
1579,1580,0.000000,422,1.009452,89.473090,0.00,0,529,534,47331.264671
1580,1581,0.000000,390,1.071287,66.668103,0.00,0,505,541,33667.391791


In [71]:
# Broadcast the per-customer statistics onto every transaction row.
df = df.merge(cust_stats, how='left', on='customer_id')

In [72]:
# Print column dtypes and non-null counts now that the per-customer statistics are attached.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1324566 entries, 0 to 1324565
Data columns (total 22 columns):
 #   Column                        Non-Null Count    Dtype  
---  ------                        --------------    -----  
 0   customer_id                   1324566 non-null  int64  
 1   item_id                       1324566 non-null  int64  
 2   quantity                      1324566 non-null  int64  
 3   selling_price                 1324566 non-null  float64
 4   coupon_discount               1324566 non-null  float64
 5   coupon_used                   1324566 non-null  int64  
 6   age_range                     1324566 non-null  object 
 7   marital_status                1324566 non-null  object 
 8   family_size                   1324566 non-null  int64  
 9   no_of_children                1324566 non-null  int64  
 10  income_bracket                1324566 non-null  int64  
 11  category                      1324566 non-null  object 
 12  gender                      

In [73]:
# Rows where a coupon discount was applied, shown without the transaction-level columns.
df[df['coupon_discount'].ne(0)].drop(
    columns=['item_id', 'quantity', 'selling_price', 'coupon_discount', 'coupon_used', 'category']
)

Unnamed: 0,customer_id,age_range,marital_status,family_size,no_of_children,income_bracket,gender,mean_discount_per_cust,unique_items_per_cust,mean_quantity_per_cust,mean_selling_price_per_cust,total_discount_per_cust,total_coupons_used_per_cust,total_items_per_cust,total_quantity_per_cust,total_selling_price_per_cust
88,464,46-55,Married,5,3,3,M,-1.826026,2040,1.043541,61.271311,-7548.79,220,4134,4314,253295.597842
93,464,46-55,Married,5,3,3,M,-1.826026,2040,1.043541,61.271311,-7548.79,220,4134,4314,253295.597842
104,464,46-55,Married,5,3,3,F,-1.826026,2040,1.043541,61.271311,-7548.79,220,4134,4314,253295.597842
107,464,46-55,Married,5,3,3,M,-1.826026,2040,1.043541,61.271311,-7548.79,220,4134,4314,253295.597842
112,464,46-55,Married,5,3,3,M,-1.826026,2040,1.043541,61.271311,-7548.79,220,4134,4314,253295.597842
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1324282,384,36-45,Single,1,0,3,F,-0.791879,372,1.016779,68.834125,-471.96,16,596,606,41025.138695
1324285,384,36-45,Single,1,0,3,F,-0.791879,372,1.016779,68.834125,-471.96,16,596,606,41025.138695
1324287,384,36-45,Single,1,0,3,M,-0.791879,372,1.016779,68.834125,-471.96,16,596,606,41025.138695
1324403,1303,56-70,Married,2,0,5,F,-0.579160,472,1.010687,49.810888,-379.35,14,655,662,32626.131523


### Merge with train (currently commented out)

In [74]:
# train = pd.read_csv('original_data/train.csv')
# train.info()

In [75]:
# train = pd.merge(train, df, on='customer_id', how='left')

In [76]:
# Remove the identifier columns and the raw quantity from the final feature set.
df.drop(columns=['customer_id', 'item_id', 'quantity'], inplace=True)

### Encoding

In [77]:
# Replace the age-range labels with integer codes and print the code -> label legend.
encoder = LabelEncoder()
df['age_range'] = encoder.fit_transform(df['age_range'])
print(
    'Age range encoding:',
    *(f'{code}: {label}' for code, label in enumerate(encoder.classes_)),
    sep='\n',
)

Age range encoding:
0: 18-25
1: 26-35
2: 36-45
3: 46-55
4: 56-70
5: 70+


In [78]:
# Refit the shared encoder on marital status and print the code -> label legend.
df['marital_status'] = encoder.fit_transform(df['marital_status'])
print(
    'Marital status encoding:',
    *(f'{code}: {label}' for code, label in enumerate(encoder.classes_)),
    sep='\n',
)

Marital status encoding:
0: Married
1: Single


In [79]:
# Refit the shared encoder on gender and print the code -> label legend.
df['gender'] = encoder.fit_transform(df['gender'])
print(
    'Gender encoding:',
    *(f'{code}: {label}' for code, label in enumerate(encoder.classes_)),
    sep='\n',
)

Gender encoding:
0: F
1: M


In [80]:
# One-hot encode the category column into category_* indicator columns.
df = pd.get_dummies(data=df, columns=['category'])
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1324566 entries, 0 to 1324565
Data columns (total 23 columns):
 #   Column                        Non-Null Count    Dtype  
---  ------                        --------------    -----  
 0   selling_price                 1324566 non-null  float64
 1   coupon_discount               1324566 non-null  float64
 2   coupon_used                   1324566 non-null  int64  
 3   age_range                     1324566 non-null  int64  
 4   marital_status                1324566 non-null  int64  
 5   family_size                   1324566 non-null  int64  
 6   no_of_children                1324566 non-null  int64  
 7   income_bracket                1324566 non-null  int64  
 8   gender                        1324566 non-null  int64  
 9   mean_discount_per_cust        1324566 non-null  float64
 10  unique_items_per_cust         1324566 non-null  int64  
 11  mean_quantity_per_cust        1324566 non-null  float64
 12  mean_selling_price_per_cust 

In [82]:
# List the columns of the final feature frame.
df.columns


Index(['selling_price', 'coupon_discount', 'coupon_used', 'age_range',
       'marital_status', 'family_size', 'no_of_children', 'income_bracket',
       'gender', 'mean_discount_per_cust', 'unique_items_per_cust',
       'mean_quantity_per_cust', 'mean_selling_price_per_cust',
       'total_discount_per_cust', 'total_coupons_used_per_cust',
       'total_items_per_cust', 'total_quantity_per_cust',
       'total_selling_price_per_cust', 'category_Boys', 'category_Girls',
       'category_Men', 'category_Sports', 'category_Women'],
      dtype='object')

### Save to a file

In [81]:
# Create the output directory if needed (no error when it already exists, and no
# gap between checking and creating), then write the final feature table
# without the index column.
os.makedirs('final_data', exist_ok=True)
df.to_csv('final_data/final_data_new_categories.csv', index=False)