# 5. Extract Community Features

In [1]:
import pandas as pd
import numpy as np

# Import Data

In [2]:
# import new data after feature engineering
def _read_without_first_col(path):
    """Read a CSV and discard its first column (presumably a saved index — confirm)."""
    return pd.read_csv(path).iloc[:, 1:]


# Transaction tables
wires = _read_without_first_col('wires.csv')
emts = _read_without_first_col('emts.csv')
cash = _read_without_first_col('cash.csv')

# Customer / external information tables
cust_info = _read_without_first_col('cust_info.csv')
detail_cust = _read_without_first_col('detailed_cust_info.csv')
ext_info = _read_without_first_col('external_info.csv')

# Community label assigned to each transaction id
coms_df = _read_without_first_col('Community_labels.csv')

In [3]:
# make sure the cash amounts are stored as floats
cash = cash.astype({'amount': float})

# Merge Community Labels to DataFrames

In [4]:
# Preview the community-label lookup table (columns: trxn_id, Community Label)
coms_df

Unnamed: 0,trxn_id,Community Label
0,LWCS42954834,127
1,NTTG55749308,7254
2,IXVD84599097,276
3,SLBV29462341,68
4,ERLU26785367,15
...,...,...
457416,JOQU43611104,871
457417,LTBH81014009,236
457418,GGHM25093698,213115
457419,CNXP31340871,213116


# Append Community Labels to Transaction Data

In [5]:
# create a function to get the community labels for each customer's transactions
def get_labels(df, labels_df=None):
    """
    Build a long table of (customer, community label) pairs for two-party transactions.

    Every transaction in ``df`` is matched to its community label through
    ``trxn_id`` and then contributes two rows to the result: one for the
    sender and one for the receiver.

    Parameters
    ----------
    df : pandas.DataFrame
        Transaction table containing the columns ``trxn_id``,
        ``sender_global_id`` and ``rec_global_id``.
    labels_df : pandas.DataFrame, optional
        Lookup table with the columns ``trxn_id`` and ``Community Label``.
        When omitted, the notebook-level ``coms_df`` is used.

    Returns
    -------
    pandas.DataFrame
        Columns ``Global_id`` and ``Community Label``. Sender rows come first,
        followed by receiver rows. The index of the merged frame is kept for
        both halves, so index values repeat. Transactions without a match in
        ``labels_df`` keep a missing ``Community Label`` (left merge).
    """
    if labels_df is None:
        # Fall back to the community table loaded at the top of the notebook
        labels_df = coms_df

    # Left merge so that every transaction is kept even if it has no label
    merged = pd.merge(df, labels_df, how='left', on='trxn_id')

    # One row per participant: the sender side and the receiver side share the
    # same community label because they belong to the same transaction
    sender_rows = merged[['sender_global_id', 'Community Label']].rename(
        columns={'sender_global_id': 'Global_id'}
    )
    receiver_rows = merged[['rec_global_id', 'Community Label']].rename(
        columns={'rec_global_id': 'Global_id'}
    )

    # Stack both sides into a single customer-level table
    return pd.concat([sender_rows, receiver_rows])

In [6]:
# Get the dataframe for wire transactions:
# each wire contributes one (Global_id, Community Label) row for its sender
# and one for its receiver
wire_labels = get_labels(wires)
wire_labels

Unnamed: 0,Global_id,Community Label
0,46393,127
1,101584,7254
2,184551,276
3,71803,68
4,117672,15
...,...,...
48287,107141,61
48288,155051,310
48289,167767,11353
48290,96357,96


In [7]:
# Get the dataframe for emt transactions:
# each emt contributes one (Global_id, Community Label) row for its sender
# and one for its receiver
emt_labels = get_labels(emts)
emt_labels

Unnamed: 0,Global_id,Community Label
0,166925,11354
1,68825,95
2,24521,165
3,77547,153
4,156121,11355
...,...,...
318895,91980,188458
318896,91130,2501
318897,82331,188459
318898,112810,38


In [8]:
# Cash transactions carry a single customer column (Global_id), so a direct
# left merge with the community table is enough and get_labels is not needed
cash_labels = (
    cash
    .merge(coms_df, how='left', on='trxn_id')
    .loc[:, ['Global_id', 'Community Label']]
)
cash_labels

Unnamed: 0,Global_id,Community Label
0,96095,2537
1,102552,188461
2,45375,16
3,39239,188462
4,80360,188463
...,...,...
90224,85640,871
90225,105558,236
90226,42954,213115
90227,23527,213116


# Feature Extraction using Group By

### Concat dataframes to single labels dataframe

In [9]:
# Concat the dataframes together
labels = pd.concat([wire_labels, emt_labels, cash_labels])

# Cast the labels to strings while keeping missing labels missing.
# The left merges used to build the inputs can leave NaN for transactions that
# have no community; a plain astype(str) would turn those into the literal
# string 'nan', which nunique()/count() would then treat as a real community.
# A NumPy mask is used so no index alignment happens (the index has repeats).
has_label = labels['Community Label'].notna().to_numpy()
labels['Community Label'] = labels['Community Label'].astype(str).where(has_label)
labels

Unnamed: 0,Global_id,Community Label
0,46393,127
1,101584,7254
2,184551,276
3,71803,68
4,117672,15
...,...,...
90224,85640,871
90225,105558,236
90226,42954,213115
90227,23527,213116


### Get Count of Unique Communities by User

In [10]:
# Number of distinct communities each user appears in
uniq_coms = (
    labels
    .groupby('Global_id')['Community Label']
    .nunique()
    .rename('Num_unique_communities_participated')
    .reset_index()
)
uniq_coms

Unnamed: 0,Global_id,Num_unique_communities_participated
0,0,1
1,1,1
2,2,1
3,3,1
4,4,6
...,...,...
196000,196053,2
196001,196054,4
196002,196055,6
196003,196056,3


### Get total number of community participations (repeats included)

In [11]:
# Total number of community participations per user: every labelled row
# (one per transaction the user took part in) is counted, repeats included
total_coms = (
    labels
    .groupby('Global_id')['Community Label']
    .count()
    .rename('Total_communities_participated')
    .reset_index()
)
total_coms

Unnamed: 0,Global_id,Total_communities_participated
0,0,1
1,1,1
2,2,1
3,3,1
4,4,20
...,...,...
196000,196053,2
196001,196054,4
196002,196055,11
196003,196056,3


# Merge the dataframes

In [12]:
# Combine both per-user aggregates into one feature table keyed by Global_id
comm_fts = uniq_coms.merge(total_coms, how='inner', on='Global_id')
comm_fts

Unnamed: 0,Global_id,Num_unique_communities_participated,Total_communities_participated
0,0,1,1
1,1,1,1
2,2,1,1
3,3,1,1
4,4,6,20
...,...,...,...
196000,196053,2,2
196001,196054,4,4
196002,196055,6,11
196003,196056,3,3


### Process Note

Initially, we planned to add a binary flag for each community to indicate whether or not an individual participated in it. However, due to the large number of communities, this was not feasible: we could not construct a matrix of that size because of memory constraints.

# Save the DataFrame

In [13]:
# Persist the per-user community features. The DataFrame index is written as
# well (pandas default index=True), so the file's first column is the row index.
comm_fts.to_csv('comm_fts.csv')