# 6. Merge Extracted Features into Customer Data

In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA

# Import Data

In [2]:
# Load the feature-engineered customer tables. The first column of each CSV
# is discarded by position (`.iloc[:, 1:]`).
detailed_cust_info, external_info = (
    pd.read_csv(csv_path).iloc[:, 1:]
    for csv_path in ('detailed_cust_info.csv', 'external_info.csv')
)

In [3]:
# Load the extracted community features, then discard the first column of the
# CSV by position.
comm_fts = pd.read_csv('comm_fts.csv')
comm_fts = comm_fts.iloc[:, 1:]

In [4]:
# Load the nine participation tables. Names on the left pair up positionally
# with the paths on the right; the first column of every CSV is discarded.
(
    wsw_part, wse_part, wsc_part,
    ese_part, esc_part, esw_part,
    csc_part, csw_part, cse_part,
) = (
    pd.read_csv(csv_path).iloc[:, 1:]
    for csv_path in (
        'wsw_part.csv', 'wse_part.csv', 'wsc_part.csv',
        'ese_part.csv', 'esc_part.csv', 'esw_part.csv',
        'csc_part.csv', 'csw_part.csv', 'cse_part.csv',
    )
)

# Merge Participation Features

In [5]:
# Participation frames to be joined onto wsw_part, in merge order.
# wsw_part itself is left out because it seeds the merged frame.
parts = [
    wse_part,
    wsc_part,
    ese_part,
    esc_part,
    esw_part,
    csc_part,
    csw_part,
    cse_part,
]

In [6]:
# Outer-join every participation frame onto wsw_part by Global_id so the
# result covers every Global_id that appears in any of the frames.
parts_df = wsw_part.copy()

for step, part in enumerate(parts):
    # pandas documents `suffixes` as a pair of strings, so build them as
    # strings rather than passing bare ints. They only take effect when the
    # incoming frame has a non-key column whose name is already present, and
    # keep such columns distinct until the frame is relabelled by position.
    parts_df = pd.merge(
        parts_df,
        part,
        how='outer',
        on='Global_id',
        suffixes=(f'_{step}', f'_{step + 1}'),
    )

In [7]:
# One label per participation frame, listed in the order the frames were
# merged: wsw_part first, then the entries of `parts`.
network_cols = [
    'wsw_part', 'wse_part', 'wsc_part',
    'ese_part', 'esc_part', 'esw_part',
    'csc_part', 'csw_part', 'cse_part',
]
part_cols = ['Global_id'] + network_cols

# Labels are applied by position, so the merged frame must have exactly
# len(part_cols) columns.
parts_df.columns = part_cols

In [8]:
# After the outer merges, a NaN means the individual took part in some other
# network but not in this one, so it is recorded as 0.
parts_df = parts_df.fillna(value=0)
parts_df

Unnamed: 0,Global_id,wsw_part,wse_part,wsc_part,ese_part,esc_part,esw_part,csc_part,csw_part,cse_part
0,14234,72.0,75.0,67.0,37.0,30.0,54.0,228.0,32.0,25.0
1,39393,71.0,105.0,1.0,137.0,0.0,88.0,0.0,9.0,9.0
2,25911,65.0,63.0,1.0,79.0,4.0,75.0,36.0,6.0,6.0
3,77559,58.0,67.0,37.0,68.0,27.0,108.0,174.0,26.0,26.0
4,61010,54.0,52.0,3.0,29.0,0.0,62.0,0.0,9.0,0.0
...,...,...,...,...,...,...,...,...,...,...
192847,183272,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
192848,128379,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
192849,66689,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
192850,40368,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


### Merge participation data to `Customer Details` DF

In [9]:
# Left-join the participation columns onto the customer details so every
# customer row is kept; customers absent from parts_df come back as NaN and
# are set to 0 (no participation).
# NOTE(review): fillna(0) runs over the whole merged frame, so any NaN already
# present in the customer columns is zeroed as well - confirm there are none.
detail_cust = detailed_cust_info.merge(parts_df, how='left', on='Global_id')
detail_cust = detail_cust.fillna(0)
detail_cust

Unnamed: 0,Global_id,Age,Tenure,label,Gender_female,Gender_male,Occupation_num,Num_wires_sent,Avg_wire_amt_sent,Num_wires_received,...,Avg_emt_sent_int,wsw_part,wse_part,wsc_part,ese_part,esc_part,esw_part,csc_part,csw_part,cse_part
0,102123,45.0,13.0,0,1,0,8,2,1985.75,0,...,2085.250,1.0,1.0,0.0,99.0,0.0,20.0,0.0,0.0,0.0
1,83128,52.0,8.0,0,0,1,146,0,0.00,0,...,0.000,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
2,80244,43.0,11.0,0,1,0,115,2,14131.50,4,...,1081.500,9.0,27.0,0.0,140.0,0.0,27.0,29.0,4.0,14.0
3,93124,39.0,21.0,1,0,1,195,2,5214.50,3,...,273.125,9.0,34.0,8.0,62.0,16.0,29.0,172.0,23.0,45.0
4,44548,26.0,8.0,0,1,0,212,0,0.00,0,...,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126178,80822,37.0,12.0,1,1,0,186,0,0.00,0,...,31.250,0.0,0.0,0.0,16.0,19.0,1.0,222.0,0.0,42.0
126179,18375,46.0,3.0,0,0,1,211,0,0.00,0,...,284.750,0.0,4.0,0.0,96.0,6.0,3.0,0.0,0.0,18.0
126180,10812,31.0,10.0,0,0,1,143,0,0.00,0,...,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
126181,23226,21.0,3.0,0,1,0,235,0,0.00,1,...,590.000,0.0,9.0,0.0,81.0,0.0,6.0,0.0,0.0,2.0


### Merge participation data to `External Info` DF

In [10]:
# Left-join the participation columns onto the external-customer frame too,
# keeping every external row and writing 0 where no participation row matched.
# NOTE(review): fillna(0) runs over the whole merged frame, not only the
# participation columns - confirm external_info has no NaN of its own.
ext_info = external_info.merge(parts_df, how='left', on='Global_id')
ext_info = ext_info.fillna(0)
ext_info

Unnamed: 0,Global_id,Num_wires_sent,Avg_wire_amt_sent,Num_wires_received,Avg_wire_amt_received,Num_emts_sent,Avg_emt_amt_sent,Num_emts_received,Avg_emt_amt_received,Country_AU,...,Country_US,wsw_part,wse_part,wsc_part,ese_part,esc_part,esw_part,csc_part,csw_part,cse_part
0,126184,1,1546.0,0,0.0,0,0.000000,0,0.00,1,...,0,3.0,10.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0
1,126199,1,1046.0,0,0.0,9,419.888889,10,797.35,0,...,0,1.0,8.0,0.0,164.0,4.0,12.0,0.0,0.0,2.0
2,126200,1,1444.0,0,0.0,0,0.000000,0,0.00,0,...,0,1.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,126201,1,3710.0,1,4011.0,8,1643.000000,5,840.40,0,...,0,3.0,16.0,1.0,76.0,3.0,12.0,0.0,0.0,18.0
4,126208,1,38450.0,0,0.0,0,0.000000,0,0.00,0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69870,196051,0,0.0,0,0.0,1,403.000000,2,494.50,0,...,0,0.0,8.0,0.0,19.0,0.0,0.0,0.0,0.0,0.0
69871,196052,0,0.0,0,0.0,1,1128.000000,2,1065.25,0,...,0,0.0,0.0,0.0,17.0,8.0,3.0,0.0,0.0,14.0
69872,196054,0,0.0,0,0.0,2,377.500000,2,166.50,0,...,0,0.0,1.0,0.0,23.0,4.0,1.0,0.0,0.0,4.0
69873,196056,0,0.0,0,0.0,3,9.333333,0,0.00,0,...,0,0.0,0.0,0.0,10.0,7.0,3.0,0.0,0.0,0.0


# Merge the Community Features

In [11]:
# Left-join the community features onto the customer-details frame by
# Global_id; rows with no match in comm_fts get 0 in the new columns.
# NOTE(review): fillna(0) is applied to the whole frame, not only to the
# community columns.
detail_cust1 = detail_cust.merge(comm_fts, how='left', on='Global_id')
detail_cust1 = detail_cust1.fillna(0)
detail_cust1

Unnamed: 0,Global_id,Age,Tenure,label,Gender_female,Gender_male,Occupation_num,Num_wires_sent,Avg_wire_amt_sent,Num_wires_received,...,wse_part,wsc_part,ese_part,esc_part,esw_part,csc_part,csw_part,cse_part,Num_unique_communities_participated,Total_communities_participated
0,102123,45.0,13.0,0,1,0,8,2,1985.75,0,...,1.0,0.0,99.0,0.0,20.0,0.0,0.0,0.0,1.0,17.0
1,83128,52.0,8.0,0,0,1,146,0,0.00,0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
2,80244,43.0,11.0,0,1,0,115,2,14131.50,4,...,27.0,0.0,140.0,0.0,27.0,29.0,4.0,14.0,2.0,26.0
3,93124,39.0,21.0,1,0,1,195,2,5214.50,3,...,34.0,8.0,62.0,16.0,29.0,172.0,23.0,45.0,8.0,29.0
4,44548,26.0,8.0,0,1,0,212,0,0.00,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126178,80822,37.0,12.0,1,1,0,186,0,0.00,0,...,0.0,0.0,16.0,19.0,1.0,222.0,0.0,42.0,8.0,21.0
126179,18375,46.0,3.0,0,0,1,211,0,0.00,0,...,4.0,0.0,96.0,6.0,3.0,0.0,0.0,18.0,16.0,16.0
126180,10812,31.0,10.0,0,0,1,143,0,0.00,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
126181,23226,21.0,3.0,0,1,0,235,0,0.00,1,...,9.0,0.0,81.0,0.0,6.0,0.0,0.0,2.0,9.0,16.0


In [12]:
# Left-join the community features onto the external-customer frame by
# Global_id; rows with no match in comm_fts get 0 in the new columns.
# NOTE(review): fillna(0) is applied to the whole frame, not only to the
# community columns.
ext_info1 = ext_info.merge(comm_fts, how='left', on='Global_id')
ext_info1 = ext_info1.fillna(0)
ext_info1

Unnamed: 0,Global_id,Num_wires_sent,Avg_wire_amt_sent,Num_wires_received,Avg_wire_amt_received,Num_emts_sent,Avg_emt_amt_sent,Num_emts_received,Avg_emt_amt_received,Country_AU,...,wse_part,wsc_part,ese_part,esc_part,esw_part,csc_part,csw_part,cse_part,Num_unique_communities_participated,Total_communities_participated
0,126184,1,1546.0,0,0.0,0,0.000000,0,0.00,1,...,10.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1
1,126199,1,1046.0,0,0.0,9,419.888889,10,797.35,0,...,8.0,0.0,164.0,4.0,12.0,0.0,0.0,2.0,18,20
2,126200,1,1444.0,0,0.0,0,0.000000,0,0.00,0,...,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1
3,126201,1,3710.0,1,4011.0,8,1643.000000,5,840.40,0,...,16.0,1.0,76.0,3.0,12.0,0.0,0.0,18.0,3,15
4,126208,1,38450.0,0,0.0,0,0.000000,0,0.00,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69870,196051,0,0.0,0,0.0,1,403.000000,2,494.50,0,...,8.0,0.0,19.0,0.0,0.0,0.0,0.0,0.0,3,3
69871,196052,0,0.0,0,0.0,1,1128.000000,2,1065.25,0,...,0.0,0.0,17.0,8.0,3.0,0.0,0.0,14.0,3,3
69872,196054,0,0.0,0,0.0,2,377.500000,2,166.50,0,...,1.0,0.0,23.0,4.0,1.0,0.0,0.0,4.0,4,4
69873,196056,0,0.0,0,0.0,3,9.333333,0,0.00,0,...,0.0,0.0,10.0,7.0,3.0,0.0,0.0,0.0,3,3


# Save Updated Dataframes

In [13]:
# Write the enriched frames back to disk. to_csv is called with its defaults,
# so the row index is written as the first column - the column that the
# `.iloc[:, 1:]` on load discards.
# NOTE(review): these are the same paths the notebook read its inputs from, so
# the inputs are overwritten. Re-running from the top without regenerating
# them would merge the participation/community columns a second time.
for frame, csv_path in (
    (detail_cust1, 'detailed_cust_info.csv'),
    (ext_info1, 'external_info.csv'),
):
    frame.to_csv(csv_path)