# Libraries

In [21]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.preprocessing import TransactionEncoder

# Pre-Processing

In [13]:
uncleaned_data = pd.read_csv('Outdoor Activities.csv')

In [14]:
# Removing unwanted columns
uncleaned_data = uncleaned_data.drop(["Timestamp", "Do you agree to become a respondent in this study?"], axis=1)

In [15]:
uncleaned_data.rename(columns = {
    
    "Approximate average income / allowance per month (₱)":"Average Income/Allowance", "Current Region (Philippines)":"Region", 
    "Choose whether you participate in any outdoor activities alone or in a group.":"Participation Type", 
    "How many people in the group generally participate in outdoor activities? (Yourself Included)":"Number of Members",
    "Individual Budget. This includes travel, eating, and other areas (₱)":"Individual Budget",
    
    "Based on the season, choose outdoor activities that you have enjoyed doing alone or with friends. (Select all that apply) [Swimming]":"Swimming",
    "Based on the season, choose outdoor activities that you have enjoyed doing alone or with friends. (Select all that apply) [Scuba Diving]":"Scuba Diving",
    "Based on the season, choose outdoor activities that you have enjoyed doing alone or with friends. (Select all that apply) [Surfing]":"Surfing",
    "Based on the season, choose outdoor activities that you have enjoyed doing alone or with friends. (Select all that apply) [Wakeboarding]":"Wakeboarding",
    "Based on the season, choose outdoor activities that you have enjoyed doing alone or with friends. (Select all that apply) [Boating]":"Boating",
    "Based on the season, choose outdoor activities that you have enjoyed doing alone or with friends. (Select all that apply) [Biking]":"Biking",
    "Based on the season, choose outdoor activities that you have enjoyed doing alone or with friends. (Select all that apply) [Hiking]":"Hiking",
    "Based on the season, choose outdoor activities that you have enjoyed doing alone or with friends. (Select all that apply) [Basketball]":"Basketball",
    "Based on the season, choose outdoor activities that you have enjoyed doing alone or with friends. (Select all that apply) [Volleyball]":"Volleyball",
    "Based on the season, choose outdoor activities that you have enjoyed doing alone or with friends. (Select all that apply) [Golfing]":"Golfing",
    "Based on the season, choose outdoor activities that you have enjoyed doing alone or with friends. (Select all that apply) [Sky Diving]":"Sky Diving",
    "Based on the season, choose outdoor activities that you have enjoyed doing alone or with friends. (Select all that apply) [Bungee Jumping]":"Bungee Jumping",
    "Based on the season, choose outdoor activities that you have enjoyed doing alone or with friends. (Select all that apply) [Tennis]":"Tennis",
    "Based on the season, choose outdoor activities that you have enjoyed doing alone or with friends. (Select all that apply) [Fishing]":"Fishing",
    "Based on the season, choose outdoor activities that you have enjoyed doing alone or with friends. (Select all that apply) [Camping]":"Camping",
    "Based on the season, choose outdoor activities that you have enjoyed doing alone or with friends. (Select all that apply) [Badminton]":"Badminton",
    "Based on the season, choose outdoor activities that you have enjoyed doing alone or with friends. (Select all that apply) [Sightseeing]":"Sightseeing",
    "Based on the season, choose outdoor activities that you have enjoyed doing alone or with friends. (Select all that apply) [Picnic]":"Picnic",
    "Based on the season, choose outdoor activities that you have enjoyed doing alone or with friends. (Select all that apply) [FoodTrip]":"FoodTrip",
    "Based on the season, choose outdoor activities that you have enjoyed doing alone or with friends. (Select all that apply) [Wildlife Viewing]":"Wildlife Viewing",
    
    "Rate the outdoor activity from 1 to 5. (1 being the lowest; 5 being the highest) [Swimming]":"Swimming Rating",
    "Rate the outdoor activity from 1 to 5. (1 being the lowest; 5 being the highest) [Scuba Diving]":"Scuba Diving Rating",
    "Rate the outdoor activity from 1 to 5. (1 being the lowest; 5 being the highest) [Surfing]":"Surfing Rating",
    "Rate the outdoor activity from 1 to 5. (1 being the lowest; 5 being the highest) [Wakeboarding]":"Wakeboarding Rating",
    "Rate the outdoor activity from 1 to 5. (1 being the lowest; 5 being the highest) [Boating]":"Boating Rating",
    "Rate the outdoor activity from 1 to 5. (1 being the lowest; 5 being the highest) [Biking]":"Biking Rating",
    "Rate the outdoor activity from 1 to 5. (1 being the lowest; 5 being the highest) [Hiking]":"Hiking Rating",
    "Rate the outdoor activity from 1 to 5. (1 being the lowest; 5 being the highest) [Basketball]":"Basketball Rating",
    "Rate the outdoor activity from 1 to 5. (1 being the lowest; 5 being the highest) [Volleyball]":"Volleyball Rating",
    "Rate the outdoor activity from 1 to 5. (1 being the lowest; 5 being the highest) [Golfing]":"Golfing Rating",
    "Rate the outdoor activity from 1 to 5. (1 being the lowest; 5 being the highest) [Sky Diving]":"Sky Diving Rating",
    "Rate the outdoor activity from 1 to 5. (1 being the lowest; 5 being the highest) [Bungee Jumping]":"Bungee Jumping Rating",
    "Rate the outdoor activity from 1 to 5. (1 being the lowest; 5 being the highest) [Tennis]":"Tennis Rating",
    "Rate the outdoor activity from 1 to 5. (1 being the lowest; 5 being the highest) [Fishing]":"Fishing Rating",
    "Rate the outdoor activity from 1 to 5. (1 being the lowest; 5 being the highest) [Camping]":"Camping Rating",
    "Rate the outdoor activity from 1 to 5. (1 being the lowest; 5 being the highest) [Badminton]":"Badminton Rating",
    "Rate the outdoor activity from 1 to 5. (1 being the lowest; 5 being the highest) [Sightseeing]":"Sightseeing Rating",
    "Rate the outdoor activity from 1 to 5. (1 being the lowest; 5 being the highest) [Picnic]":"Picnic Rating",
    "Rate the outdoor activity from 1 to 5. (1 being the lowest; 5 being the highest) [FoodTrip]":"FoodTrip Rating",
    "Rate the outdoor activity from 1 to 5. (1 being the lowest; 5 being the highest) [Wildlife Viewing]":"Wildlife Viewing Rating",
    
    "Do you have anything more you like to do during the DRY SEASON in addition to the outdoor activities on the list? If Yes, please input exactly one (1) activity.":"Dry Season Addition",
    "Do you have more you like to do during the RAINY SEASON in addition to the outdoor activities on the list? If Yes, please input exactly one (1) activity.":"Rainy Season Addition",
    "Thinking about the outdoor activities that you haven't tried; what keeps you from doing these activities?":"Preventing Forces"
    
}, inplace = True)

uncleaned_data.columns = uncleaned_data.columns.str.replace(r'\s+', '_', regex=True)

In [16]:
# Check column names
print(uncleaned_data.columns)

Index(['Age_Group', 'Gender', 'Marital_Status', 'Employment_Status',
       'Average_Income/Allowance', 'Region', 'Participation_Type',
       'Number_of_Members', 'Individual_Budget', 'Swimming', 'Scuba_Diving',
       'Surfing', 'Wakeboarding', 'Boating', 'Biking', 'Hiking', 'Basketball',
       'Volleyball', 'Golfing', 'Sky_Diving', 'Bungee_Jumping', 'Tennis',
       'Fishing', 'Camping', 'Badminton', 'Sightseeing', 'Picnic', 'FoodTrip',
       'Wildlife_Viewing', 'Swimming_Rating', 'Scuba_Diving_Rating',
       'Surfing_Rating', 'Wakeboarding_Rating', 'Boating_Rating',
       'Biking_Rating', 'Hiking_Rating', 'Basketball_Rating',
       'Volleyball_Rating', 'Golfing_Rating', 'Sky_Diving_Rating',
       'Bungee_Jumping_Rating', 'Tennis_Rating', 'Fishing_Rating',
       'Camping_Rating', 'Badminton_Rating', 'Sightseeing_Rating',
       'Picnic_Rating', 'FoodTrip_Rating', 'Wildlife_Viewing_Rating',
       'Dry_Season_Addition', 'Rainy_Season_Addition', 'Preventing_Forces'],
      dtyp

In [17]:
# Notice how the symbol '-' in region is not actually a '-', but rather a '–'
# Replace all instances of '–' with regular '-'
uncleaned_data["Region"] = uncleaned_data["Region"].str.replace('–', '-')

In [18]:
data = uncleaned_data
data.head()

Unnamed: 0,Age_Group,Gender,Marital_Status,Employment_Status,Average_Income/Allowance,Region,Participation_Type,Number_of_Members,Individual_Budget,Swimming,...,Fishing_Rating,Camping_Rating,Badminton_Rating,Sightseeing_Rating,Picnic_Rating,FoodTrip_Rating,Wildlife_Viewing_Rating,Dry_Season_Addition,Rainy_Season_Addition,Preventing_Forces
0,18 - 25 years,Male,Single,Student,< 4999,CAR - Cordillera Administrative Region,With Both Friends and Relatives,5 - 10,1000 - 1999,Dry Season,...,Haven't Tried,Haven't Tried,Haven't Tried,3,4,5,Haven't Tried,,,The activities are out of my budget;I don't ha...
1,18 - 25 years,Male,Single,Student,Prefer not to disclose,Region III - Central Luzon,Solo,< 5,< 999,Dry Season;Rainy Season,...,1,2,1,3,3,2,Haven't Tried,,,I simply don't like the activities;The activit...
2,18 - 25 years,Male,Single,Student,10000 - 19999,CAR - Cordillera Administrative Region,With Both Friends and Relatives,5 - 10,1000 - 1999,Dry Season,...,Haven't Tried,4,4,5,5,5,5,,,I simply don't like the activities;The activit...
3,18 - 25 years,Female,Single,Student,10000 - 19999,CAR - Cordillera Administrative Region,With Friends,< 5,< 999,Dry Season;Rainy Season,...,3,5,4,5,5,5,5,,,I simply don't like the activities
4,18 - 25 years,Female,Single,Student,Prefer not to disclose,Region I - Ilocos Region,With Both Friends and Relatives,5 - 10,< 999,Dry Season,...,Haven't Tried,5,4,3,5,5,Haven't Tried,,,The activities are out of my budget;Not enough...


In [19]:
# Group each columns - List

personal_data_colnames = ['Age_Group', 'Gender', 'Marital_Status', 'Employment_Status', 'Average Income/Allowance', 'Region']

pre_assessment_colnames = ['Participation_Type', 'Number_of_Members', 'Individual_Budget']

outdoor_activities_colnames = ['Swimming', 'Scuba_Diving', 'Surfing', 'Wakeboarding', 'Boating', 'Biking', 'Hiking', 'Basketball',
                           'Volleyball', 'Golfing', 'Sky_Diving', 'Bungee_Jumping', 'Tennis', 'Fishing', 'Camping', 'Badminton', 
                           'Sightseeing', 'Picnic', 'FoodTrip', 'Wildlife_Viewing']
                    
rating_colnames = ['Swimming_Rating', 'Scuba_Diving_Rating', 'Surfing_Rating', 'Wakeboarding_Rating', 'Boating_Rating',
                  'Biking_Rating', 'Hiking_Rating', 'Basketball_Rating', 'Volleyball_Rating', 'Golfing_Rating', 'Sky_Diving_Rating',
                  'Bungee_Jumping_Rating', 'Tennis_Rating', 'Fishing_Rating', 'Camping_Rating', 'Badminton_Rating', 
                  'Sightseeing_Rating', 'Picnic_Rating', 'FoodTrip_Rating', 'Wildlife_Viewing_Rating']

# Compiling All Outdoor Activities by Season

In [20]:
# This cell is to replicate ma'am's example

def determine_list_state(respo_list):
    if respo_list:
        return respo_list
    

dry_season_oa_dataset = []
rainy_season_oa_dataset = []
# ht = Haven't Tried
ht_oa_dataset = []

# 0 1 2
season_ht_list = ["Dry Season", "Rainy Season"]

for row in data[outdoor_activities_colnames].itertuples():
    respo_ds_oa_answer_list = []
    respo_rs_oa_answer_list = []
    respo_ht_oa_answer_list = []

    for col_name in outdoor_activities_colnames:
        if season_ht_list[0] in getattr(row, col_name):
            respo_ds_oa_answer_list.append(col_name)
        elif season_ht_list[1] in getattr(row, col_name):
            respo_rs_oa_answer_list.append(col_name)
        else:
            respo_ht_oa_answer_list.append(col_name)
        
    dry_season_oa_dataset.append(respo_ds_oa_answer_list)
    rainy_season_oa_dataset.append(respo_rs_oa_answer_list)
    ht_oa_dataset.append(respo_ht_oa_answer_list)


# Remove empty lists inside list.
dry_season_oa_dataset = [x for x in dry_season_oa_dataset if x != []]
rainy_season_oa_dataset = [x for x in rainy_season_oa_dataset if x != []]
ht_oa_dataset = [x for x in ht_oa_dataset if x != []]
    
print(dry_season_oa_dataset)
print("\n")
print(rainy_season_oa_dataset)
print("\n")
print(ht_oa_dataset)

[['Swimming', 'Biking', 'Hiking', 'Basketball', 'Volleyball', 'Sightseeing', 'Picnic', 'FoodTrip'], ['Swimming', 'Biking', 'Hiking', 'Volleyball', 'Bungee_Jumping', 'Fishing', 'Camping', 'Badminton', 'Sightseeing', 'Picnic', 'FoodTrip'], ['Swimming', 'Scuba_Diving', 'Boating', 'Biking', 'Hiking', 'Basketball', 'Volleyball', 'Tennis', 'Camping', 'Badminton', 'Sightseeing', 'Picnic', 'FoodTrip', 'Wildlife_Viewing'], ['Swimming', 'Scuba_Diving', 'Boating', 'Hiking', 'Basketball', 'Volleyball', 'Golfing', 'Fishing', 'Camping', 'Badminton', 'Sightseeing', 'Picnic', 'FoodTrip', 'Wildlife_Viewing'], ['Swimming', 'Boating', 'Biking', 'Basketball', 'Volleyball', 'Camping', 'Badminton', 'Sightseeing', 'Picnic', 'FoodTrip'], ['Swimming', 'Boating', 'Biking', 'Hiking', 'Camping', 'Badminton', 'Sightseeing', 'Picnic'], ['Swimming', 'Scuba_Diving', 'Surfing', 'Wakeboarding', 'Boating', 'Hiking', 'Tennis', 'Camping', 'Sightseeing', 'Picnic', 'FoodTrip'], ['Swimming', 'Boating', 'Biking', 'Volleyball'

In [83]:
# Determine minimum support for each season
print("Minimum Support for Dry Season O_A: {}".format(len(dry_season_oa_dataset)*60/100))
print("Minimum Support for Rainy Season O_A: {}".format(len(rainy_season_oa_dataset)*60/100))

Minimum Support for Dry Season O_A: 60.6
Minimum Support for Rainy Season O_A: 6.6


In [136]:
te1 = TransactionEncoder()
te2 = TransactionEncoder()
te3 = TransactionEncoder()

te_dry_season_oa_dataset = te1.fit(dry_season_oa_dataset).transform(dry_season_oa_dataset)
te_rainy_season_oa_dataset = te2.fit(rainy_season_oa_dataset).transform(rainy_season_oa_dataset)

te_ht_oa_dataset = te3.fit(ht_oa_dataset).transform(ht_oa_dataset)

In [137]:
df_dry_season_oa_dataset = pd.DataFrame(te_dry_season_oa_dataset, columns=te1.columns_)
df_dry_season_oa_dataset

Unnamed: 0,Badminton,Basketball,Biking,Boating,Bungee_Jumping,Camping,Fishing,FoodTrip,Golfing,Hiking,Picnic,Scuba_Diving,Sightseeing,Sky_Diving,Surfing,Swimming,Tennis,Volleyball,Wakeboarding,Wildlife_Viewing
0,False,True,True,False,False,False,False,True,False,True,True,False,True,False,False,True,False,True,False,False
1,True,False,True,False,True,True,True,True,False,True,True,False,True,False,False,True,False,True,False,False
2,True,True,True,True,False,True,False,True,False,True,True,True,True,False,False,True,True,True,False,True
3,True,True,False,True,False,True,True,True,True,True,True,True,True,False,False,True,False,True,False,True
4,True,True,True,True,False,True,False,True,False,False,True,False,True,False,False,True,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,True,True,True,True,False,False,False,True,True,True,True,True,True,False,True,True,False,True,True,True
97,True,True,True,True,False,True,True,True,True,True,True,True,True,False,True,True,True,True,True,True
98,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
99,True,False,True,True,False,True,True,True,False,True,True,False,True,False,False,True,True,True,False,True


In [138]:
df_rainy_season_oa_dataset = pd.DataFrame(te_rainy_season_oa_dataset, columns=te2.columns_)
df_rainy_season_oa_dataset

Unnamed: 0,Badminton,Basketball,Biking,Boating,Camping,Fishing,FoodTrip,Hiking,Sightseeing,Swimming,Tennis,Volleyball,Wakeboarding,Wildlife_Viewing
0,False,False,False,False,False,False,True,False,False,False,False,False,False,False
1,True,False,True,False,False,True,False,False,False,False,False,False,False,True
2,False,False,False,False,False,True,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,True,False,False,False
4,True,True,False,False,False,False,False,False,False,False,True,True,False,False
5,False,False,False,False,False,False,False,False,True,True,False,False,True,False
6,False,False,False,False,True,True,False,False,False,False,False,False,False,False
7,False,False,False,True,False,False,False,False,False,False,False,False,False,False
8,False,False,False,False,False,False,True,False,False,False,False,False,False,False
9,True,False,False,False,False,False,False,True,False,False,False,False,False,False


# Association Rule Mining P1

In [139]:
freq_dryseason_itemsets = apriori(df_dry_season_oa_dataset, min_support = 0.606, use_colnames = True)
freq_dryseason_itemsets.sort_values(by=['support']).tail()

Unnamed: 0,support,itemsets
7,0.940594,(Picnic)
50,0.940594,"(Swimming, Picnic)"
43,0.960396,"(Swimming, FoodTrip)"
5,0.960396,(FoodTrip)
9,0.990099,(Swimming)


In [140]:
res_dryseason = association_rules(freq_dryseason_itemsets, metric="confidence", min_threshold=1)
res_dryseason[res_dryseason['confidence'] >= 1]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Badminton),(Swimming),0.881188,0.990099,0.881188,1.0,1.010000,0.008725,inf
1,(Boating),(Swimming),0.811881,0.990099,0.811881,1.0,1.010000,0.008038,inf
2,(Camping),(Swimming),0.841584,0.990099,0.841584,1.0,1.010000,0.008333,inf
3,(FoodTrip),(Swimming),0.960396,0.990099,0.960396,1.0,1.010000,0.009509,inf
4,(Hiking),(Swimming),0.841584,0.990099,0.841584,1.0,1.010000,0.008333,inf
...,...,...,...,...,...,...,...,...,...
464,"(Hiking, Badminton, Biking, FoodTrip, Sightsee...",(Picnic),0.643564,0.940594,0.643564,1.0,1.063158,0.038232,inf
465,"(Hiking, Badminton, Biking, Picnic, FoodTrip, ...","(Swimming, Sightseeing)",0.643564,0.930693,0.643564,1.0,1.074468,0.044603,inf
466,"(Hiking, Badminton, Biking, FoodTrip, Sightsee...","(Swimming, Picnic)",0.643564,0.940594,0.643564,1.0,1.063158,0.038232,inf
467,"(Hiking, Badminton, Biking, FoodTrip, Swimming...","(Sightseeing, Picnic)",0.643564,0.910891,0.643564,1.0,1.097826,0.057347,inf


# Association Rule Mining P2

In [141]:
freq_rainyseason_itemsets = apriori(df_rainy_season_oa_dataset, min_support = 0.09, use_colnames = True)
freq_rainyseason_itemsets.sort_values(by=['support']).tail()

Unnamed: 0,support,itemsets
10,0.181818,(Tennis)
6,0.181818,(FoodTrip)
2,0.181818,(Biking)
5,0.272727,(Fishing)
0,0.272727,(Badminton)


In [142]:
res_rainyseason = association_rules(freq_rainyseason_itemsets, metric="confidence", min_threshold=1)
res_rainyseason[res_rainyseason['confidence'] >= 1]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Basketball),(Badminton),0.090909,0.272727,0.090909,1.0,3.666667,0.066116,inf
1,(Hiking),(Badminton),0.090909,0.272727,0.090909,1.0,3.666667,0.066116,inf
2,(Volleyball),(Badminton),0.090909,0.272727,0.090909,1.0,3.666667,0.066116,inf
3,(Wildlife_Viewing),(Badminton),0.090909,0.272727,0.090909,1.0,3.666667,0.066116,inf
4,(Basketball),(Tennis),0.090909,0.181818,0.090909,1.0,5.500000,0.074380,inf
...,...,...,...,...,...,...,...,...,...
74,"(Badminton, Wildlife_Viewing)","(Biking, Fishing)",0.090909,0.090909,0.090909,1.0,11.000000,0.082645,inf
75,"(Biking, Fishing)","(Badminton, Wildlife_Viewing)",0.090909,0.090909,0.090909,1.0,11.000000,0.082645,inf
76,"(Biking, Wildlife_Viewing)","(Badminton, Fishing)",0.090909,0.090909,0.090909,1.0,11.000000,0.082645,inf
77,"(Fishing, Wildlife_Viewing)","(Badminton, Biking)",0.090909,0.090909,0.090909,1.0,11.000000,0.082645,inf
