In [1]:
# Import Required Libraries and Dataset

# Data-science Library Imports
import pandas as pd
import numpy as np

# Math Imports
import math

# Plot Imports
import matplotlib.pyplot as plt
import seaborn as sns

# Set plot syles
sns.set_style('whitegrid')
%matplotlib inline

# Machine Learning Imports
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

#Import Dataset
data = pd.read_excel('Combined data_province and age.xlsx')

In [2]:
# Preview of data
data.head()

Unnamed: 0,Country,Province,Age Group,Age,BirthYear,Gender,EmploymentType,JobType,InformalWorker,Informal Work Type,...,AidSource_Charities/ Donations,AidSource_Friends/family,AidSource_Government,AidSource_Not sure,AidSource_Other,AidSource_Religious Organizations,COVIDLoans,MobileMoneyActivity,MobileMoneyDeposit,GovernmentPriority
0,Ivory Coast (Cote D'Ivoire),Gôh-Djiboua,15-25,24,1995,Male,Student,,,,...,,,,,,,No,Decreased,Yes,Protecting people from COVID-19
1,Ivory Coast (Cote D'Ivoire),Montagnes,15-25,21,1998,Female,Self-employed,,Yes,Hair/Skin/Nail services,...,,,,,,,Yes,Don’t know,Yes,Reopening the economy
2,Ivory Coast (Cote D'Ivoire),Abidjan,36+,50,1969,Male,Other,,,,...,,,,,,,Yes,Stayed the same,No,Protecting people from COVID-19
3,Ivory Coast (Cote D'Ivoire),Abidjan,26-35,33,1986,Female,Self-employed,,Yes,Hair/Skin/Nail services,...,,,,,,,Yes,Increased,Yes,Reopening the economy
4,Ivory Coast (Cote D'Ivoire),Montagnes,15-25,19,2000,Male,Other,,,,...,,,,,,,Yes,Stayed the same,No,Protecting people from COVID-19


In [3]:
# Convert into data frame
data_df = pd.DataFrame(data = data)

In [4]:
# Data Preparation
# Country, province,Age, Birth Year columns dropped
data_prepared = data_df.drop(columns=['Country','Province','Age','BirthYear'])

In [5]:
# Age Group
# check for missing values
data_prepared['Age Group'].isnull().sum() 

0

In [6]:
# Gender
# Check for missing values
data_prepared['Gender'].isnull().sum()

0

In [7]:
# Employment type
# Check for missing values
data_prepared['EmploymentType'].isnull().sum() # No missing values

0

In [8]:
# Dropping Job Type, Infromal worker, Informal work type, Other job columns
data_prepared = data_prepared.drop(columns=['JobType','InformalWorker','Informal Work Type','OtherJob'])

In [9]:
# Job Loss
# Check for missing values
data_prepared['JobLoss'].isnull().sum()
# Replace missing values with Not Applicable
data_prepared["JobLoss"].fillna('Job loss-Not Applicable', inplace = True)
# Change column values to readable format, we are keeping same number of levels here
data_prepared["JobLoss"].replace({"No-I’m still able to work": "Job Not lost due to Covid", 
                                  "Yes": "Job Lost due to Covid", 
                                  "Prefer not to say":"Prefer not stay about Job loss"}, inplace=True)

In [10]:
# Job Regain
data_prepared['JobRegain'].isnull().sum()
# Replace missing values with Not Applicable
data_prepared["JobRegain"].fillna('Job regain-Not Applicable', inplace = True)
# Change column values to readable format, we are keeping same number of levels here
data_prepared["JobRegain"].replace({"Don't know": "Not sure about job regain", 
                                    "No": "Cannot regain job", 
                                    "Yes":"Can regain job"}, inplace=True)

In [11]:
# Monthly income dropped--No proper data entry, can deal with montly income bracket instead
data_prepared = data_prepared.drop(columns=['MonthlyIncome'])

In [12]:
# Monthly income bracket
# Check for missing values
data_prepared['MonthlyIncome Bracket'].isnull().sum() #1218 Missing values
# Check for the employmnet type of corresponding missing values
for i in range(len(data_prepared)):
  if pd.isna(data_prepared.loc[i, "MonthlyIncome Bracket"]):
    print(data_prepared.loc[i, "EmploymentType"])
# Replace NaN with first bracket values
data_prepared["MonthlyIncome Bracket"].fillna('Income between 0-10000', inplace = True) 
data_prepared['MonthlyIncome Bracket'] = data_prepared['MonthlyIncome Bracket'].apply(str)
# change levels to readable format
data_prepared["MonthlyIncome Bracket"].replace({"11.0": "Income between 0-10000",
                                         "1.0": "Income between 0-10000", 
                                         "2.0": "Income between 10001-20000",
                                         "3.0": "Income between 20001-50000",
                                         "4.0": "Income between 50001-100000",
                                         "5.0": "Income between 100001-200000",
                                         "6.0": "Income above 200000"}, inplace=True)


Student
Unemployed
Unemployed
Student
Student
Student
Unemployed
Unemployed
Student
Unemployed
Student
Unemployed
Unemployed
Unemployed
Student
Student
Unemployed
Unemployed
Unemployed
Student
Unemployed
Unemployed
Unemployed
Student
Unemployed
Unemployed
Unemployed
Student
Student
Unemployed
Unemployed
Unemployed
Student
Unemployed
Student
Unemployed
Unemployed
Unemployed
Student
Student
Unemployed
Unemployed
Student
Student
Student
Student
Student
Unemployed
Unemployed
Unemployed
Unemployed
Unemployed
Unemployed
Unemployed
Unemployed
Unemployed
Student
Unemployed
Student
Unemployed
Student
Unemployed
Student
Unemployed
Unemployed
Unemployed
Unemployed
Unemployed
Unemployed
Student
Student
Unemployed
Student
Unemployed
Student
Student
Student
Unemployed
Student
Unemployed
Student
Unemployed
Student
Unemployed
Unemployed
Student
Unemployed
Unemployed
Unemployed
Student
Student
Student
Unemployed
Unemployed
Unemployed
Student
Student
Unemployed
Unemployed
Unemployed
Student
Student
Stud

In [13]:
# Income Change
# Check for misisng values
data_prepared['IncomeChange'].isnull().sum() #1213 Missing values
# Replace NaN with 'Not Applicable'
data_prepared["IncomeChange"].fillna('Income Change-Not Applicable', inplace = True)
# Change column values to readable format, we are keeping same number of levels here
data_prepared["IncomeChange"].replace({"Decreased a bit": "Income change decreased", 
                                                 "Decreased a lot": "Income change decreased",
                                                 "Increased a bit": "Income change increased",
                                                 "Increased a lot": "Income change increased",
                                                 "No change": "No change in income"}, inplace=True)

In [14]:
#Expense Responsibility
data_prepared['ExpenseResponsibility'].isnull().sum()
# Change column values to readable format, we are keeping same number of levels here
data_prepared["ExpenseResponsibility"].replace({"No": "No Expense Responsibility", 
                                                "Yes": "Has Expense Responsibility"}, inplace=True)

In [15]:
# Length survival
data_prepared['LengthSurvival'].isnull().sum() # 600 missing values
# Replace missing values with Not Applicable
data_prepared["LengthSurvival"].fillna('Unsure about length survival', inplace = True)
# Change column values to readable format, We are grouping levels here
data_prepared["LengthSurvival"].replace({"Don’t know": "Unsure about length survival", 
                                         "Don’t know": "Unsure about length survival",
                                         "5+ months": "Can survive for 5+ months",
                                         "2-4 months": "Can survive for 0-4 months",
                                         "1 month": "Can Survive for 0-4 months",
                                         "< a month": "Can survive for 0-4 months"}, inplace=True)

In [16]:
# Money for expenses
# Check for missing values
data_prepared['MoneyForExpenses'].isnull().sum() # 600 missing values
# Check for corresponding Employmnet type and expense responsibility for missing data
for i in range(len(data_prepared)):
  if pd.isna(data_prepared.loc[i, "MoneyForExpenses"]):
    print(data_prepared.loc[i, "ExpenseResponsibility"])
    print(data_prepared.loc[i, "EmploymentType"])
# Replace missing values with 'Other way for expenses'
data_prepared["MoneyForExpenses"].fillna('Other way for expenses', inplace = True)
# Change column values to readable format, we are keeping same number of levels here
data_prepared["MoneyForExpenses"].replace({"Other": "Other way for expenses"}, inplace=True)

No Expense Responsibility
Student
No Expense Responsibility
Self-employed
No Expense Responsibility
Employed part time
No Expense Responsibility
Employed full time
No Expense Responsibility
Employed part time
No Expense Responsibility
Student
No Expense Responsibility
Self-employed
No Expense Responsibility
Student
No Expense Responsibility
Unemployed
No Expense Responsibility
Unemployed
No Expense Responsibility
Unemployed
No Expense Responsibility
Student
No Expense Responsibility
Unemployed
No Expense Responsibility
Student
No Expense Responsibility
Unemployed
No Expense Responsibility
Unemployed
No Expense Responsibility
Student
No Expense Responsibility
Unemployed
No Expense Responsibility
Student
No Expense Responsibility
Unemployed
No Expense Responsibility
Self-employed
No Expense Responsibility
Unemployed
No Expense Responsibility
Employed part time
No Expense Responsibility
Unemployed
No Expense Responsibility
Employed full time
No Expense Responsibility
Unemployed
No Expense

No Expense Responsibility
Unemployed
No Expense Responsibility
Unemployed
No Expense Responsibility
Employed part time
No Expense Responsibility
Student
No Expense Responsibility
Student
No Expense Responsibility
Employed full time
No Expense Responsibility
Unemployed
No Expense Responsibility
Employed full time
No Expense Responsibility
Unemployed
No Expense Responsibility
Unemployed
No Expense Responsibility
Employed full time
No Expense Responsibility
Employed full time
No Expense Responsibility
Employed full time
No Expense Responsibility
Unemployed
No Expense Responsibility
Student
No Expense Responsibility
Unemployed
No Expense Responsibility
Unemployed
No Expense Responsibility
Unemployed
No Expense Responsibility
Unemployed
No Expense Responsibility
Unemployed
No Expense Responsibility
Unemployed
No Expense Responsibility
Unemployed
No Expense Responsibility
Unemployed
No Expense Responsibility
Unemployed
No Expense Responsibility
Employed part time
No Expense Responsibility
Em

In [17]:
# ConcernExpenses
# Check for missing values
data_prepared['ConcernExpenses'].isnull().sum() # 600 missing values
# Check for corresponding Employmnet type and expense responsibility for missing data
for i in range(len(data_prepared)):
  if pd.isna(data_prepared.loc[i, "ConcernExpenses"]):
    print(data_prepared.loc[i, "ExpenseResponsibility"])
    print(data_prepared.loc[i, "EmploymentType"])
# Replace missing data with 'No change'
data_prepared["ConcernExpenses"].fillna('No change in concern for expenses', inplace = True)
# Change column values to readable format, we are keeping same number of levels here
data_prepared["ConcernExpenses"].replace({"Less concerned": "Less concerned about expenses than before covid",
                                          "More concerned": "More concerned about expenses than before covid",
                                          "No change": "No change in concern for expenses"}, inplace=True)

No Expense Responsibility
Student
No Expense Responsibility
Self-employed
No Expense Responsibility
Employed part time
No Expense Responsibility
Employed full time
No Expense Responsibility
Employed part time
No Expense Responsibility
Student
No Expense Responsibility
Self-employed
No Expense Responsibility
Student
No Expense Responsibility
Unemployed
No Expense Responsibility
Unemployed
No Expense Responsibility
Unemployed
No Expense Responsibility
Student
No Expense Responsibility
Unemployed
No Expense Responsibility
Student
No Expense Responsibility
Unemployed
No Expense Responsibility
Unemployed
No Expense Responsibility
Student
No Expense Responsibility
Unemployed
No Expense Responsibility
Student
No Expense Responsibility
Unemployed
No Expense Responsibility
Self-employed
No Expense Responsibility
Unemployed
No Expense Responsibility
Employed part time
No Expense Responsibility
Unemployed
No Expense Responsibility
Employed full time
No Expense Responsibility
Unemployed
No Expense

In [18]:
# Expense concern rating
# Check for missing values
data_prepared['Expense Concern Rating'].isnull().sum() # 600 missing values
# Check for corresponding Employmnet type and expense responsibility for missing data
for i in range(len(data_prepared)):
  if pd.isna(data_prepared.loc[i, "Expense Concern Rating"]):
    print(data_prepared.loc[i, "ExpenseResponsibility"])
    print(data_prepared.loc[i, "EmploymentType"])
# Replace missing data with 'Not concerned'
data_prepared["Expense Concern Rating"].fillna('Not concerned for next 6 months', inplace = True)
# Convert the column into string--Association needs data in string
data_prepared['Expense Concern Rating'] = data_prepared['Expense Concern Rating'].apply(str) 
# Change column values to readable format the number of levels are reduced
data_prepared["Expense Concern Rating"].replace({"1.0": "Not concerned for next 6 months",
                                                 "1.4": "Not concerned for next 6 months",
                                                 "2.0": "Reasonably concerned for next 6 months",
                                                 "3.0": "Reasonably concerned for next 6 months",
                                                 "4.0": "Reasonably concerned for next 6 months",
                                                 "5.0": "Extremely concerned for next 6 months"}, inplace=True)

No Expense Responsibility
Student
No Expense Responsibility
Self-employed
No Expense Responsibility
Employed part time
No Expense Responsibility
Employed full time
No Expense Responsibility
Employed part time
No Expense Responsibility
Student
No Expense Responsibility
Self-employed
No Expense Responsibility
Student
No Expense Responsibility
Unemployed
No Expense Responsibility
Unemployed
No Expense Responsibility
Unemployed
No Expense Responsibility
Student
No Expense Responsibility
Unemployed
No Expense Responsibility
Student
No Expense Responsibility
Unemployed
No Expense Responsibility
Unemployed
No Expense Responsibility
Student
No Expense Responsibility
Unemployed
No Expense Responsibility
Student
No Expense Responsibility
Unemployed
No Expense Responsibility
Self-employed
No Expense Responsibility
Unemployed
No Expense Responsibility
Employed part time
No Expense Responsibility
Unemployed
No Expense Responsibility
Employed full time
No Expense Responsibility
Unemployed
No Expense

Student
No Expense Responsibility
Employed part time
No Expense Responsibility
Unemployed
No Expense Responsibility
Student
No Expense Responsibility
Self-employed
No Expense Responsibility
Student
No Expense Responsibility
Student
No Expense Responsibility
Student
No Expense Responsibility
Employed full time
No Expense Responsibility
Student
No Expense Responsibility
Unemployed
No Expense Responsibility
Student
No Expense Responsibility
Student
No Expense Responsibility
Self-employed
No Expense Responsibility
Self-employed
No Expense Responsibility
Unemployed
No Expense Responsibility
Unemployed
No Expense Responsibility
Student
No Expense Responsibility
Unemployed
No Expense Responsibility
Student
No Expense Responsibility
Self-employed
No Expense Responsibility
Student
No Expense Responsibility
Employed full time
No Expense Responsibility
Self-employed
No Expense Responsibility
Student
No Expense Responsibility
Unemployed
No Expense Responsibility
Student
No Expense Responsibility
U

In [19]:
# Monthly need column is dropped as it is an int type column
data_prepared = data_prepared.drop(columns=['MonthlyNeed'])

In [20]:
# Top priority and low priority columns dropped
data_prepared = data_prepared.drop(columns=['TopPriority','LowPriority'])

In [21]:
# Aid
# Check for missing values
data_prepared['Aid'].isnull().sum()
# Change column values to readable format, we are keeping same number of levels here
data_prepared["Aid"].replace({"No": "Aid Not Received",
                              "Yes": "Aid Received"}, inplace=True)

In [22]:
# Remove all aid sources columns
data_prepared = data_prepared.drop(columns=['AidSource_Aid Organisations','AidSource_Charities/ Donations',
                                            'AidSource_Friends/family','AidSource_Government','AidSource_Not sure',
                                            'AidSource_Other','AidSource_Religious Organizations','GovernmentPriority'])

In [23]:
#COVIDLoans
# Check for missing values
data_prepared['COVIDLoans'].isnull().sum()
# Change column values to readable format, we are keeping same number of levels here
data_prepared["COVIDLoans"].replace({"No": "No Loans taken due to Covid",
                              "Yes": "Loans taken due to Covid"}, inplace=True)

In [24]:
# MobileMoneyActivity
# Check for missing values
data_prepared['MobileMoneyActivity'].isnull().sum()
# Change column values to readable format, we are keeping same number of levels here
data_prepared["MobileMoneyActivity"].replace({"Decreased": "Mobile Money activity Decreased",
                                              "Increased": "Mobile Money activity Increased",
                                              "Stayed the same": "Mobile Money activity stayed the same",
                                              "Don’t know": "Unsure of Mobile Money Activity",
                                              "Don't know": "Unsure of Mobile Money Activity"}, inplace=True)

In [25]:
# MobileMoneyDeposit
# Check for missing values
data_prepared['MobileMoneyDeposit'].isnull().sum()
# Change column values to readable format, we are keeping same number of levels here
data_prepared["MobileMoneyDeposit"].replace({"No": "No mobile money deposit after Covid",
                                              "Yes": "There is mobile money deposit after Covid"}, inplace=True)

In [26]:
# convert data to numpy array
data_array = data_prepared.to_numpy()
# Transform into transactional data format for association
oht = TransactionEncoder()
oht_array = oht.fit(data_array).transform(data_array)
transform_df = pd.DataFrame(oht_array, columns= oht.columns_)
print(transform_df)

      15-25  18-25  26-35    36+  Aid Not Received  Aid Received  \
0      True  False  False  False              True         False   
1      True  False  False  False              True         False   
2     False  False  False   True              True         False   
3     False  False   True  False              True         False   
4      True  False  False  False              True         False   
...     ...    ...    ...    ...               ...           ...   
2495  False   True  False  False              True         False   
2496  False  False  False   True              True         False   
2497  False   True  False  False              True         False   
2498  False  False  False   True              True         False   
2499  False   True  False  False              True         False   

      Can Survive for 0-4 months  Can regain job  Can survive for 0-4 months  \
0                          False           False                       False   
1                      

In [27]:
# Generate frequent itemsets--Support 50%
frequent_itemsets = apriori(transform_df, min_support = 0.5, use_colnames=True)
print(frequent_itemsets)

    support                                           itemsets
0    0.8320                                 (Aid Not Received)
1    0.5020                                           (Female)
2    0.7600                       (Has Expense Responsibility)
3    0.8108                           (Income between 0-10000)
4    0.6948                        (Job regain-Not Applicable)
5    0.6032                  (Mobile Money activity Decreased)
6    0.5348  (More concerned about expenses than before covid)
7    0.5288                      (No Loans taken due to Covid)
8    0.5052        (There is mobile money deposit after Covid)
9    0.6388     (Has Expense Responsibility, Aid Not Received)
10   0.6704         (Aid Not Received, Income between 0-10000)
11   0.5780      (Job regain-Not Applicable, Aid Not Received)
12   0.5140  (Mobile Money activity Decreased, Aid Not Rece...
13   0.5980  (Has Expense Responsibility, Income between 0-...
14   0.5348  (Has Expense Responsibility, More concerne

In [28]:
# Generate rules--Confidence 50%
rules = association_rules(frequent_itemsets,metric='confidence',min_threshold=0.5)
print(rules)

                                          antecedents  \
0                        (Has Expense Responsibility)   
1                                  (Aid Not Received)   
2                                  (Aid Not Received)   
3                            (Income between 0-10000)   
4                         (Job regain-Not Applicable)   
5                                  (Aid Not Received)   
6                   (Mobile Money activity Decreased)   
7                                  (Aid Not Received)   
8                        (Has Expense Responsibility)   
9                            (Income between 0-10000)   
10                       (Has Expense Responsibility)   
11  (More concerned about expenses than before covid)   
12                        (Job regain-Not Applicable)   
13                           (Income between 0-10000)   
14      (Job regain-Not Applicable, Aid Not Received)   
15  (Job regain-Not Applicable, Income between 0-1...   
16         (Aid Not Received, I

In [29]:
# Generate frequent itemsets--Support 60%
frequent_itemsets = apriori(transform_df, min_support = 0.6, use_colnames=True)
print(frequent_itemsets)

   support                                           itemsets
0   0.8320                                 (Aid Not Received)
1   0.7600                       (Has Expense Responsibility)
2   0.8108                           (Income between 0-10000)
3   0.6948                        (Job regain-Not Applicable)
4   0.6032                  (Mobile Money activity Decreased)
5   0.6388     (Has Expense Responsibility, Aid Not Received)
6   0.6704         (Aid Not Received, Income between 0-10000)
7   0.6164  (Job regain-Not Applicable, Income between 0-1...


In [30]:
# Generate rules--Confidence 60%
rules = association_rules(frequent_itemsets,metric='confidence',min_threshold=0.6)
print(rules)

                    antecedents                   consequents  \
0  (Has Expense Responsibility)            (Aid Not Received)   
1            (Aid Not Received)  (Has Expense Responsibility)   
2            (Aid Not Received)      (Income between 0-10000)   
3      (Income between 0-10000)            (Aid Not Received)   
4   (Job regain-Not Applicable)      (Income between 0-10000)   
5      (Income between 0-10000)   (Job regain-Not Applicable)   

   antecedent support  consequent support  support  confidence      lift  \
0              0.7600              0.8320   0.6388    0.840526  1.010248   
1              0.8320              0.7600   0.6388    0.767788  1.010248   
2              0.8320              0.8108   0.6704    0.805769  0.993795   
3              0.8108              0.8320   0.6704    0.826838  0.993795   
4              0.6948              0.8108   0.6164    0.887162  1.094181   
5              0.8108              0.6948   0.6164    0.760237  1.094181   

   leverage

In [31]:
# Generate frequent itemsets--Support 70%
frequent_itemsets = apriori(transform_df, min_support = 0.7, use_colnames=True)
print(frequent_itemsets)

   support                      itemsets
0   0.8320            (Aid Not Received)
1   0.7600  (Has Expense Responsibility)
2   0.8108      (Income between 0-10000)


In [32]:
# Generate rules--Confidende 70%
rules = association_rules(frequent_itemsets,metric='confidence',min_threshold=0.7)
print(rules)

Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction]
Index: []


In [33]:
# Generate association rules by removing misisng values.
# Convert data into data frame
data_nonnull = pd.DataFrame(data = data)
# Drop all the columns that are not needed
data_nonnull = data_nonnull.drop(columns=['Country','Province','Age','BirthYear'])
data_nonnull = data_nonnull.drop(columns=['JobType','InformalWorker','Informal Work Type','OtherJob'])
data_nonnull = data_nonnull.drop(columns=['MonthlyIncome'])
data_nonnull = data_nonnull.drop(columns=['MonthlyNeed'])
data_nonnull = data_nonnull.drop(columns=['TopPriority','LowPriority'])
data_nonnull = data_nonnull.drop(columns=['AidSource_Aid Organisations','AidSource_Charities/ Donations',
                                            'AidSource_Friends/family','AidSource_Government','AidSource_Not sure',
                                            'AidSource_Other','AidSource_Religious Organizations','GovernmentPriority'])
# Drop all the observations with NaN values
data_nonnull = data_nonnull.dropna()
# Change column values
data_nonnull["JobLoss"].replace({"No-I’m still able to work": "Job Not lost due to Covid", 
                                  "Yes": "Job Lost due to Covid", 
                                  "Prefer not to say":"Prefer not stay about Job loss"}, inplace=True)
data_nonnull["JobRegain"].replace({"Don't know": "Not sure about job regain", 
                                    "No": "Cannot regain job", 
                                    "Yes":"Can regain job"}, inplace=True)
data_nonnull['MonthlyIncome Bracket'] = data_nonnull['MonthlyIncome Bracket'].apply(str)
data_nonnull["MonthlyIncome Bracket"].replace({"11.0": "Income between 0-10000",
                                         "1.0": "Income between 0-10000", 
                                         "2.0": "Income between 10001-20000",
                                         "3.0": "Income between 20001-50000",
                                         "4.0": "Income between 50001-100000",
                                         "5.0": "Income between 100001-200000",
                                         "6.0": "Income above 200000"}, inplace=True)
data_nonnull["IncomeChange"].replace({"Decreased a bit": "Income change decreased", 
                                                 "Decreased a lot": "Income change decreased",
                                                 "Increased a bit": "Income change increased",
                                                 "Increased a lot": "Income change increased",
                                                 "No change": "No change in income"}, inplace=True)
data_nonnull["ExpenseResponsibility"].replace({"No": "No Expense Responsibility", 
                                                "Yes": "Has Expense Responsibility"}, inplace=True)
data_nonnull["LengthSurvival"].replace({"Don’t know": "Unsure about length survival", 
                                         "Don’t know": "Unsure about length survival",
                                         "5+ months": "Can survive for 5+ months",
                                         "2-4 months": "Can survive for 0-4 months",
                                         "1 month": "Can Survive for 0-4 months",
                                         "< a month": "Can survive for 0-4 months"}, inplace=True)
data_nonnull["MoneyForExpenses"].replace({"Other": "Other way for expenses"}, inplace=True)
data_nonnull["ConcernExpenses"].replace({"Less concerned": "Less concerned about expenses than before covid",
                                          "More concerned": "More concerned about expenses than before covid",
                                          "No change": "No change in concern for expenses"}, inplace=True)
data_nonnull['Expense Concern Rating'] = data_nonnull['Expense Concern Rating'].apply(str)
data_nonnull["Expense Concern Rating"].replace({"1.0": "Not concerned for next 6 months",
                                                 "1.4": "Not concerned for next 6 months",
                                                 "2.0": "Reasonably concerned for next 6 months",
                                                 "3.0": "Reasonably concerned for next 6 months",
                                                 "4.0": "Reasonably concerned for next 6 months",
                                                 "5.0": "Extremely concerned for next 6 months"}, inplace=True)
data_nonnull["Aid"].replace({"No": "Aid Not Received",
                              "Yes": "Aid Received"}, inplace=True)
data_nonnull["COVIDLoans"].replace({"No": "No Loans taken due to Covid",
                              "Yes": "Loans taken due to Covid"}, inplace=True)
data_nonnull["MobileMoneyActivity"].replace({"Decreased": "Mobile Money activity Decreased",
                                              "Increased": "Mobile Money activity Increased",
                                              "Stayed the same": "Mobile Money activity stayed the same",
                                              "Don’t know": "Unsure of Mobile Money Activity",
                                              "Don't know": "Unsure of Mobile Money Activity"}, inplace=True)
data_nonnull["MobileMoneyDeposit"].replace({"No": "No mobile money deposit after Covid",
                                              "Yes": "There is mobile money deposit after Covid"}, inplace=True)

In [35]:
# Preview data
data_nonnull.head(5)

Unnamed: 0,Age Group,Gender,EmploymentType,JobLoss,JobRegain,MonthlyIncome Bracket,IncomeChange,ExpenseResponsibility,LengthSurvival,MoneyForExpenses,ConcernExpenses,Expense Concern Rating,Aid,COVIDLoans,MobileMoneyActivity,MobileMoneyDeposit
3,26-35,Female,Self-employed,Job Lost due to Covid,Can regain job,Income between 10001-20000,Income change decreased,Has Expense Responsibility,Can survive for 0-4 months,Savings,More concerned about expenses than before covid,Reasonably concerned for next 6 months,Aid Not Received,Loans taken due to Covid,Mobile Money activity Increased,There is mobile money deposit after Covid
10,26-35,Female,Self-employed,Job Lost due to Covid,Not sure about job regain,Income between 0-10000,Income change decreased,Has Expense Responsibility,Unsure about length survival,Loan/ Credit,No change in concern for expenses,Not concerned for next 6 months,Aid Not Received,Loans taken due to Covid,Mobile Money activity Decreased,There is mobile money deposit after Covid
14,26-35,Female,Employed full time,Job Lost due to Covid,Can regain job,Income between 0-10000,Income change decreased,Has Expense Responsibility,Can Survive for 0-4 months,Savings,Less concerned about expenses than before covid,Extremely concerned for next 6 months,Aid Not Received,Loans taken due to Covid,Mobile Money activity Increased,There is mobile money deposit after Covid
15,36+,Male,Self-employed,Job Lost due to Covid,Not sure about job regain,Income between 0-10000,Income change decreased,Has Expense Responsibility,Unsure about length survival,Loan/ Credit,More concerned about expenses than before covid,Extremely concerned for next 6 months,Aid Not Received,Loans taken due to Covid,Mobile Money activity Decreased,No mobile money deposit after Covid
16,26-35,Male,Self-employed,Job Lost due to Covid,Can regain job,Income between 10001-20000,Income change decreased,Has Expense Responsibility,Can Survive for 0-4 months,Loan/ Credit,More concerned about expenses than before covid,Reasonably concerned for next 6 months,Aid Not Received,Loans taken due to Covid,Mobile Money activity Decreased,There is mobile money deposit after Covid


In [36]:
# Convert data into numpy array
data_nonnull_array = data_nonnull.to_numpy()
# Convert data into transactional format used for association
oht = TransactionEncoder()
oht_nonnull_array = oht.fit(data_nonnull_array).transform(data_nonnull_array)
transform_df_nonnull = pd.DataFrame(oht_nonnull_array, columns= oht.columns_)
print(transform_df_nonnull)

     15-25  18-25  26-35    36+  Aid Not Received  Aid Received  \
0    False  False   True  False              True         False   
1    False  False   True  False              True         False   
2    False  False   True  False              True         False   
3    False  False  False   True              True         False   
4    False  False   True  False              True         False   
..     ...    ...    ...    ...               ...           ...   
656  False  False  False   True              True         False   
657  False  False   True  False             False          True   
658  False  False   True  False              True         False   
659  False  False   True  False              True         False   
660  False  False   True  False              True         False   

     Can Survive for 0-4 months  Can regain job  Can survive for 0-4 months  \
0                         False            True                        True   
1                         False      

In [37]:
# Generate frequent itemsets--suport 60%
frequent_itemsets = apriori(transform_df_nonnull, min_support = 0.6, use_colnames=True)
print(frequent_itemsets)

     support                                           itemsets
0   0.839637                                 (Aid Not Received)
1   0.620272            (Extremely concerned for next 6 months)
2   1.000000                       (Has Expense Responsibility)
3   0.650530                           (Income between 0-10000)
4   0.791225                          (Income change decreased)
5   1.000000                            (Job Lost due to Covid)
6   0.639939                  (Mobile Money activity Decreased)
7   0.723147  (More concerned about expenses than before covid)
8   0.839637     (Has Expense Responsibility, Aid Not Received)
9   0.671710        (Aid Not Received, Income change decreased)
10  0.839637          (Aid Not Received, Job Lost due to Covid)
11  0.614221  (Aid Not Received, More concerned about expens...
12  0.620272  (Has Expense Responsibility, Extremely concern...
13  0.620272  (Job Lost due to Covid, Extremely concerned fo...
14  0.650530  (Has Expense Responsibilit

In [38]:
# generate rules-- Confidence 60%
rules = association_rules(frequent_itemsets,metric='confidence',min_threshold=0.6)
print(rules)

                                           antecedents  \
0                         (Has Expense Responsibility)   
1                                   (Aid Not Received)   
2                                   (Aid Not Received)   
3                            (Income change decreased)   
4                                   (Aid Not Received)   
..                                                 ...   
113  (Job Lost due to Covid, More concerned about e...   
114                       (Has Expense Responsibility)   
115                                 (Aid Not Received)   
116                            (Job Lost due to Covid)   
117  (More concerned about expenses than before covid)   

                                           consequents  antecedent support  \
0                                   (Aid Not Received)            1.000000   
1                         (Has Expense Responsibility)            0.839637   
2                            (Income change decreased)            0.8

In [39]:
# Generate frequent itemsets--suport 70%
frequent_itemsets = apriori(transform_df_nonnull, min_support = 0.7, use_colnames=True)
print(frequent_itemsets)

     support                                           itemsets
0   0.839637                                 (Aid Not Received)
1   1.000000                       (Has Expense Responsibility)
2   0.791225                          (Income change decreased)
3   1.000000                            (Job Lost due to Covid)
4   0.723147  (More concerned about expenses than before covid)
5   0.839637     (Has Expense Responsibility, Aid Not Received)
6   0.839637          (Aid Not Received, Job Lost due to Covid)
7   0.791225  (Has Expense Responsibility, Income change dec...
8   1.000000  (Has Expense Responsibility, Job Lost due to C...
9   0.723147  (Has Expense Responsibility, More concerned ab...
10  0.791225   (Job Lost due to Covid, Income change decreased)
11  0.723147  (Job Lost due to Covid, More concerned about e...
12  0.839637  (Has Expense Responsibility, Aid Not Received,...
13  0.791225  (Has Expense Responsibility, Job Lost due to C...
14  0.723147  (Has Expense Responsibilit

In [40]:
# Generate rules-- confidence 70%
rules = association_rules(frequent_itemsets,metric='confidence',min_threshold=0.7)
print(rules)

                                          antecedents  \
0                        (Has Expense Responsibility)   
1                                  (Aid Not Received)   
2                                  (Aid Not Received)   
3                             (Job Lost due to Covid)   
4                        (Has Expense Responsibility)   
5                           (Income change decreased)   
6                        (Has Expense Responsibility)   
7                             (Job Lost due to Covid)   
8                        (Has Expense Responsibility)   
9   (More concerned about expenses than before covid)   
10                            (Job Lost due to Covid)   
11                          (Income change decreased)   
12                            (Job Lost due to Covid)   
13  (More concerned about expenses than before covid)   
14     (Has Expense Responsibility, Aid Not Received)   
15  (Has Expense Responsibility, Job Lost due to C...   
16          (Aid Not Received, 

In [41]:
# Generate frequent itemsets--suport 80%
frequent_itemsets = apriori(transform_df_nonnull, min_support = 0.8, use_colnames=True)
print(frequent_itemsets)

    support                                           itemsets
0  0.839637                                 (Aid Not Received)
1  1.000000                       (Has Expense Responsibility)
2  1.000000                            (Job Lost due to Covid)
3  0.839637     (Has Expense Responsibility, Aid Not Received)
4  0.839637          (Aid Not Received, Job Lost due to Covid)
5  1.000000  (Has Expense Responsibility, Job Lost due to C...
6  0.839637  (Has Expense Responsibility, Aid Not Received,...


In [42]:
# Generate rules-- Confidence 80%
rules = association_rules(frequent_itemsets,metric='confidence',min_threshold=0.8)
print(rules)

                                          antecedents  \
0                        (Has Expense Responsibility)   
1                                  (Aid Not Received)   
2                                  (Aid Not Received)   
3                             (Job Lost due to Covid)   
4                        (Has Expense Responsibility)   
5                             (Job Lost due to Covid)   
6      (Has Expense Responsibility, Aid Not Received)   
7   (Has Expense Responsibility, Job Lost due to C...   
8           (Aid Not Received, Job Lost due to Covid)   
9                        (Has Expense Responsibility)   
10                                 (Aid Not Received)   
11                            (Job Lost due to Covid)   

                                          consequents  antecedent support  \
0                                  (Aid Not Received)            1.000000   
1                        (Has Expense Responsibility)            0.839637   
2                          

In [43]:
# Generate frequent items sets--support 90%
frequent_itemsets = apriori(transform_df_nonnull, min_support = 0.9, use_colnames=True)
print(frequent_itemsets)

   support                                           itemsets
0      1.0                       (Has Expense Responsibility)
1      1.0                            (Job Lost due to Covid)
2      1.0  (Has Expense Responsibility, Job Lost due to C...


In [44]:
# Generate rules -- Confidenve 90% 
rules = association_rules(frequent_itemsets,metric='confidence',min_threshold=0.9)
print(rules)

                    antecedents                   consequents  \
0  (Has Expense Responsibility)       (Job Lost due to Covid)   
1       (Job Lost due to Covid)  (Has Expense Responsibility)   

   antecedent support  consequent support  support  confidence  lift  \
0                 1.0                 1.0      1.0         1.0   1.0   
1                 1.0                 1.0      1.0         1.0   1.0   

   leverage  conviction  
0       0.0         inf  
1       0.0         inf  


In [45]:
#Perform association on columns that are important from supervised learning and target variable
# MonthlyIncome Bracket', 'ConcernExpenses','ExpenseResponsibility',
# LengthSurvival','MoneyForExpenses'
data_cols = pd.DataFrame(data = data)
data_cols = data_cols.filter(['MonthlyIncome Bracket','ConcernExpenses','ExpenseResponsibility','LengthSurvival'
                             ,'MoneyForExpenses','Expense Concern Rating'])

In [46]:
# Drop all NA values
data_cols = data_cols.dropna()
# Columns to readable values
data_cols['MonthlyIncome Bracket'] = data_cols['MonthlyIncome Bracket'].apply(str)
data_cols["MonthlyIncome Bracket"].replace({"11.0": "Income between 0-10000",
                                         "1.0": "Income between 0-10000", 
                                         "2.0": "Income between 10001-20000",
                                         "3.0": "Income between 20001-50000",
                                         "4.0": "Income between 50001-100000",
                                         "5.0": "Income between 100001-200000",
                                         "6.0": "Income above 200000"}, inplace=True)
data_cols["ConcernExpenses"].replace({"Less concerned": "Less concerned about expenses than before covid",
                                          "More concerned": "More concerned about expenses than before covid",
                                          "No change": "No change in concern for expenses"}, inplace=True)
data_cols["ExpenseResponsibility"].replace({"No": "No Expense Responsibility", 
                                                "Yes": "Has Expense Responsibility"}, inplace=True)
data_cols["LengthSurvival"].replace({"Don’t know": "Unsure about length survival", 
                                         "Don’t know": "Unsure about length survival",
                                         "5+ months": "Can survive for 5+ months",
                                         "2-4 months": "Can survive for 0-4 months",
                                         "1 month": "Can Survive for 0-4 months",
                                         "< a month": "Can survive for 0-4 months"}, inplace=True)
data_cols["MoneyForExpenses"].replace({"Other": "Other way for expenses"}, inplace=True)
data_cols['Expense Concern Rating'] = data_prepared['Expense Concern Rating'].apply(str)
data_cols["Expense Concern Rating"].replace({"1.0": "Not concerned for next 6 months",
                                                 "1.4": "Not concerned for next 6 months",
                                                 "2.0": "Reasonably concerned for next 6 months",
                                                 "3.0": "Reasonably concerned for next 6 months",
                                                 "4.0": "Reasonably concerned for next 6 months",
                                                 "5.0": "Extremely concerned for next 6 months"}, inplace=True)
data_cols

Unnamed: 0,MonthlyIncome Bracket,ConcernExpenses,ExpenseResponsibility,LengthSurvival,MoneyForExpenses,Expense Concern Rating
2,Income between 0-10000,No change in concern for expenses,Has Expense Responsibility,Can survive for 0-4 months,Other way for expenses,Extremely concerned for next 6 months
3,Income between 10001-20000,More concerned about expenses than before covid,Has Expense Responsibility,Can survive for 0-4 months,Savings,Reasonably concerned for next 6 months
4,Income between 0-10000,No change in concern for expenses,Has Expense Responsibility,Unsure about length survival,Savings,Not concerned for next 6 months
7,Income between 0-10000,Less concerned about expenses than before covid,Has Expense Responsibility,Can survive for 0-4 months,Other way for expenses,Extremely concerned for next 6 months
10,Income between 0-10000,No change in concern for expenses,Has Expense Responsibility,Unsure about length survival,Loan/ Credit,Not concerned for next 6 months
...,...,...,...,...,...,...
2491,Income between 10001-20000,More concerned about expenses than before covid,Has Expense Responsibility,Can survive for 0-4 months,Loan/ Credit,Extremely concerned for next 6 months
2494,Income between 0-10000,More concerned about expenses than before covid,Has Expense Responsibility,Don't know,Loan/ Credit,Extremely concerned for next 6 months
2495,Income between 0-10000,More concerned about expenses than before covid,Has Expense Responsibility,Can survive for 0-4 months,Loan/ Credit,Extremely concerned for next 6 months
2496,Income between 50001-100000,More concerned about expenses than before covid,Has Expense Responsibility,Don't know,Salary/income,Extremely concerned for next 6 months


In [47]:
# Convert data to numpy array
data_cols_array = data_cols.to_numpy()
# Convert data to trnsactional format for performing association
oht = TransactionEncoder()
oht_data_cols_array = oht.fit(data_cols_array).transform(data_cols_array)
transform_data_cols = pd.DataFrame(oht_data_cols_array, columns= oht.columns_)
print(transform_data_cols)

      Can Survive for 0-4 months  Can survive for 0-4 months  \
0                          False                        True   
1                          False                        True   
2                          False                       False   
3                          False                        True   
4                          False                       False   
...                          ...                         ...   
1086                       False                        True   
1087                       False                       False   
1088                       False                        True   
1089                       False                       False   
1090                       False                        True   

      Can survive for 5+ months  Don't know  \
0                         False       False   
1                         False       False   
2                         False       False   
3                         False       False

In [48]:
# Generate frequent itemsets -- support 50%
frequent_itemsets_data_cols = apriori(transform_data_cols, min_support = 0.5, use_colnames=True)
print(frequent_itemsets_data_cols)

    support                                           itemsets
0  0.566453                       (Can survive for 0-4 months)
1  0.587534            (Extremely concerned for next 6 months)
2  1.000000                       (Has Expense Responsibility)
3  0.628781                           (Income between 0-10000)
4  0.696609  (More concerned about expenses than before covid)
5  0.566453  (Has Expense Responsibility, Can survive for 0...
6  0.587534  (Has Expense Responsibility, Extremely concern...
7  0.628781  (Has Expense Responsibility, Income between 0-...
8  0.696609  (Has Expense Responsibility, More concerned ab...


In [49]:
# Generate rules --Confidence 90%
rules_data_cols = association_rules(frequent_itemsets_data_cols,metric='confidence',min_threshold=0.5)
print(rules_data_cols)

                                         antecedents  \
0                       (Has Expense Responsibility)   
1                       (Can survive for 0-4 months)   
2                       (Has Expense Responsibility)   
3            (Extremely concerned for next 6 months)   
4                       (Has Expense Responsibility)   
5                           (Income between 0-10000)   
6                       (Has Expense Responsibility)   
7  (More concerned about expenses than before covid)   

                                         consequents  antecedent support  \
0                       (Can survive for 0-4 months)            1.000000   
1                       (Has Expense Responsibility)            0.566453   
2            (Extremely concerned for next 6 months)            1.000000   
3                       (Has Expense Responsibility)            0.587534   
4                           (Income between 0-10000)            1.000000   
5                       (Has Expense Re

In [50]:
# Generate frequent itemsets-- support 60%
frequent_itemsets_data_cols = apriori(transform_data_cols, min_support = 0.6, use_colnames=True)
print(frequent_itemsets_data_cols)

    support                                           itemsets
0  1.000000                       (Has Expense Responsibility)
1  0.628781                           (Income between 0-10000)
2  0.696609  (More concerned about expenses than before covid)
3  0.628781  (Has Expense Responsibility, Income between 0-...
4  0.696609  (Has Expense Responsibility, More concerned ab...


In [51]:
# Generate rules -- Confidence 60%
rules_data_cols = association_rules(frequent_itemsets_data_cols,metric='confidence',min_threshold=0.6)
print(rules_data_cols)

                                         antecedents  \
0                       (Has Expense Responsibility)   
1                           (Income between 0-10000)   
2                       (Has Expense Responsibility)   
3  (More concerned about expenses than before covid)   

                                         consequents  antecedent support  \
0                           (Income between 0-10000)            1.000000   
1                       (Has Expense Responsibility)            0.628781   
2  (More concerned about expenses than before covid)            1.000000   
3                       (Has Expense Responsibility)            0.696609   

   consequent support   support  confidence  lift  leverage  conviction  
0            0.628781  0.628781    0.628781   1.0       0.0         1.0  
1            1.000000  0.628781    1.000000   1.0       0.0         inf  
2            0.696609  0.696609    0.696609   1.0       0.0         1.0  
3            1.000000  0.696609    1.00000

In [53]:
# Generate frequent itemsets-- support 70%
frequent_itemsets_data_cols = apriori(transform_data_cols, min_support = 0.7, use_colnames=True)
print(frequent_itemsets_data_cols)

   support                      itemsets
0      1.0  (Has Expense Responsibility)


In [54]:
# Generate rules -- confidence 70%
rules_data_cols = association_rules(frequent_itemsets_data_cols,metric='confidence',min_threshold=0.7)
print(rules_data_cols)

Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction]
Index: []
