# Libraries

In [30]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.preprocessing import TransactionEncoder

# Pre-Processing

In [31]:
uncleaned_data = pd.read_csv('Outdoor Activities.csv')

In [32]:
# Removing unwanted columns
uncleaned_data = uncleaned_data.drop(["Timestamp", "Do you agree to become a respondent in this study?"], axis=1)

In [33]:
uncleaned_data.rename(columns = {
    
    "Current Region (Philippines)":"Region",

    "Based on the season, choose outdoor activities that you have enjoyed doing alone or with friends. (Select all that apply) [Swimming]":"Swimming",
    "Based on the season, choose outdoor activities that you have enjoyed doing alone or with friends. (Select all that apply) [Scuba Diving]":"Scuba Diving",
    "Based on the season, choose outdoor activities that you have enjoyed doing alone or with friends. (Select all that apply) [Surfing]":"Surfing",
    "Based on the season, choose outdoor activities that you have enjoyed doing alone or with friends. (Select all that apply) [Wakeboarding]":"Wakeboarding",
    "Based on the season, choose outdoor activities that you have enjoyed doing alone or with friends. (Select all that apply) [Boating]":"Boating",
    "Based on the season, choose outdoor activities that you have enjoyed doing alone or with friends. (Select all that apply) [Biking]":"Biking",
    "Based on the season, choose outdoor activities that you have enjoyed doing alone or with friends. (Select all that apply) [Hiking]":"Hiking",
    "Based on the season, choose outdoor activities that you have enjoyed doing alone or with friends. (Select all that apply) [Basketball]":"Basketball",
    "Based on the season, choose outdoor activities that you have enjoyed doing alone or with friends. (Select all that apply) [Volleyball]":"Volleyball",
    "Based on the season, choose outdoor activities that you have enjoyed doing alone or with friends. (Select all that apply) [Golfing]":"Golfing",
    "Based on the season, choose outdoor activities that you have enjoyed doing alone or with friends. (Select all that apply) [Sky Diving]":"Sky Diving",
    "Based on the season, choose outdoor activities that you have enjoyed doing alone or with friends. (Select all that apply) [Bungee Jumping]":"Bungee Jumping",
    "Based on the season, choose outdoor activities that you have enjoyed doing alone or with friends. (Select all that apply) [Tennis]":"Tennis",
    "Based on the season, choose outdoor activities that you have enjoyed doing alone or with friends. (Select all that apply) [Fishing]":"Fishing",
    "Based on the season, choose outdoor activities that you have enjoyed doing alone or with friends. (Select all that apply) [Camping]":"Camping",
    "Based on the season, choose outdoor activities that you have enjoyed doing alone or with friends. (Select all that apply) [Badminton]":"Badminton",
    "Based on the season, choose outdoor activities that you have enjoyed doing alone or with friends. (Select all that apply) [Sightseeing]":"Sightseeing",
    "Based on the season, choose outdoor activities that you have enjoyed doing alone or with friends. (Select all that apply) [Picnic]":"Picnic",
    "Based on the season, choose outdoor activities that you have enjoyed doing alone or with friends. (Select all that apply) [FoodTrip]":"FoodTrip",
    "Based on the season, choose outdoor activities that you have enjoyed doing alone or with friends. (Select all that apply) [Wildlife Viewing]":"Wildlife Viewing",

    "Thinking about the outdoor activities that you haven't tried; what keeps you from doing these activities?":"Preventing Forces"
    
}, inplace = True)

uncleaned_data.columns = uncleaned_data.columns.str.replace(r'\s+', '_', regex=True)

In [34]:
# Check column names
print(uncleaned_data.columns)

Index(['Age_Group', 'Gender', 'Region', 'Swimming', 'Scuba_Diving', 'Surfing',
       'Wakeboarding', 'Boating', 'Biking', 'Hiking', 'Basketball',
       'Volleyball', 'Golfing', 'Sky_Diving', 'Bungee_Jumping', 'Tennis',
       'Fishing', 'Camping', 'Badminton', 'Sightseeing', 'Picnic', 'FoodTrip',
       'Wildlife_Viewing', 'Preventing_Forces'],
      dtype='object')


In [35]:
# Notice how the symbol '-' in region is not actually a '-', but rather a '–'
# Replace all instances of '–' with regular '-'
uncleaned_data["Region"] = uncleaned_data["Region"].str.replace('–', '-')

In [36]:
data = uncleaned_data
data.head()

Unnamed: 0,Age_Group,Gender,Region,Swimming,Scuba_Diving,Surfing,Wakeboarding,Boating,Biking,Hiking,...,Bungee_Jumping,Tennis,Fishing,Camping,Badminton,Sightseeing,Picnic,FoodTrip,Wildlife_Viewing,Preventing_Forces
0,18 - 25 years,Male,CAR - Cordillera Administrative Region,Dry Season,Haven't Tried,Haven't Tried,Haven't Tried,Haven't Tried,Dry Season,Dry Season,...,Haven't Tried,Haven't Tried,Haven't Tried,Haven't Tried,Haven't Tried,Dry Season,Dry Season,Dry Season,Haven't Tried,"The activities are out of my budget, I don't h..."
1,18 - 25 years,Male,CAR - Cordillera Administrative Region,"Dry Season, Rainy Season",Haven't Tried,Haven't Tried,Haven't Tried,Haven't Tried,Dry Season,Dry Season,...,Dry Season,Haven't Tried,"Dry Season, Rainy Season",Dry Season,Dry Season,"Dry Season, Rainy Season","Dry Season, Rainy Season","Dry Season, Rainy Season",Haven't Tried,"I simply don't like the activities, The activi..."
2,18 - 25 years,Male,CAR - Cordillera Administrative Region,Dry Season,Dry Season,Haven't Tried,Haven't Tried,Dry Season,"Dry Season, Rainy Season",Dry Season,...,Haven't Tried,Dry Season,Haven't Tried,Dry Season,Dry Season,Dry Season,Dry Season,Dry Season,"Dry Season, Rainy Season","I simply don't like the activities, The activi..."
3,18 - 25 years,Female,CAR - Cordillera Administrative Region,"Dry Season, Rainy Season",Dry Season,Haven't Tried,Haven't Tried,Dry Season,Haven't Tried,Dry Season,...,Haven't Tried,Haven't Tried,Dry Season,Dry Season,Dry Season,"Dry Season, Rainy Season",Dry Season,"Dry Season, Rainy Season",Dry Season,I simply don't like the activities
4,18 - 25 years,Female,CAR - Cordillera Administrative Region,Dry Season,Haven't Tried,Haven't Tried,Haven't Tried,Dry Season,Dry Season,Haven't Tried,...,Haven't Tried,Haven't Tried,Haven't Tried,Dry Season,Dry Season,Dry Season,Dry Season,Dry Season,Haven't Tried,"The activities are out of my budget, Not enoug..."


In [37]:
# Group each columns - List
personal_data_colnames = ['Age_Group', 'Gender', 'Marital_Status', 'Employment_Status', 'Average Income/Allowance', 'Region']

pre_assessment_colnames = ['Participation_Type', 'Number_of_Members', 'Individual_Budget']

outdoor_activities_colnames = ['Swimming', 'Scuba_Diving', 'Surfing', 'Wakeboarding', 'Boating', 'Biking', 'Hiking', 'Basketball',
                           'Volleyball', 'Golfing', 'Sky_Diving', 'Bungee_Jumping', 'Tennis', 'Fishing', 'Camping', 'Badminton', 
                           'Sightseeing', 'Picnic', 'FoodTrip', 'Wildlife_Viewing']
                    
rating_colnames = ['Swimming_Rating', 'Scuba_Diving_Rating', 'Surfing_Rating', 'Wakeboarding_Rating', 'Boating_Rating',
                  'Biking_Rating', 'Hiking_Rating', 'Basketball_Rating', 'Volleyball_Rating', 'Golfing_Rating', 'Sky_Diving_Rating',
                  'Bungee_Jumping_Rating', 'Tennis_Rating', 'Fishing_Rating', 'Camping_Rating', 'Badminton_Rating', 
                  'Sightseeing_Rating', 'Picnic_Rating', 'FoodTrip_Rating', 'Wildlife_Viewing_Rating']

# Compiling All Outdoor Activities by Season

In [38]:
# This cell is to replicate ma'am's example

def determine_list_state(respo_list):
    if respo_list:
        return respo_list
    

dry_season_oa_dataset = []
rainy_season_oa_dataset = []
# ht = Haven't Tried
ht_oa_dataset = []

# 0 1 2
season_ht_list = ["Dry Season", "Rainy Season", "Haven't Tried"]

for row in data[outdoor_activities_colnames].itertuples():
    respo_ds_oa_answer_list = []
    respo_rs_oa_answer_list = []
    respo_ht_oa_answer_list = []

    for col_name in outdoor_activities_colnames:
        if season_ht_list[0] in getattr(row, col_name):
            respo_ds_oa_answer_list.append(col_name)
        if season_ht_list[1] in getattr(row, col_name):
            respo_rs_oa_answer_list.append(col_name)
        if season_ht_list[2] in getattr(row, col_name):
            respo_ht_oa_answer_list.append(col_name)
        
    dry_season_oa_dataset.append(respo_ds_oa_answer_list)
    rainy_season_oa_dataset.append(respo_rs_oa_answer_list)
    ht_oa_dataset.append(respo_ht_oa_answer_list)


# Remove empty lists inside list.
dry_season_oa_dataset = [x for x in dry_season_oa_dataset if x != []]
rainy_season_oa_dataset = [x for x in rainy_season_oa_dataset if x != []]
ht_oa_dataset = [x for x in ht_oa_dataset if x != []]
    
print(dry_season_oa_dataset[0:5])
print("\n")
print(rainy_season_oa_dataset[0:5])
print("\n")
print(ht_oa_dataset[0:5])

[['Swimming', 'Biking', 'Hiking', 'Basketball', 'Volleyball', 'Sightseeing', 'Picnic', 'FoodTrip'], ['Swimming', 'Biking', 'Hiking', 'Volleyball', 'Bungee_Jumping', 'Fishing', 'Camping', 'Badminton', 'Sightseeing', 'Picnic', 'FoodTrip'], ['Swimming', 'Scuba_Diving', 'Boating', 'Biking', 'Hiking', 'Basketball', 'Volleyball', 'Tennis', 'Camping', 'Badminton', 'Sightseeing', 'Picnic', 'FoodTrip', 'Wildlife_Viewing'], ['Swimming', 'Scuba_Diving', 'Boating', 'Hiking', 'Basketball', 'Volleyball', 'Golfing', 'Fishing', 'Camping', 'Badminton', 'Sightseeing', 'Picnic', 'FoodTrip', 'Wildlife_Viewing'], ['Swimming', 'Boating', 'Biking', 'Basketball', 'Volleyball', 'Camping', 'Badminton', 'Sightseeing', 'Picnic', 'FoodTrip']]


[['Swimming', 'Volleyball', 'Fishing', 'Sightseeing', 'Picnic', 'FoodTrip'], ['Biking', 'Wildlife_Viewing'], ['Swimming', 'Sightseeing', 'FoodTrip'], ['FoodTrip'], ['Biking', 'Hiking', 'Tennis', 'Fishing', 'Camping', 'Badminton', 'Sightseeing', 'FoodTrip', 'Wildlife_Viewing

In [39]:
# Determine minimum support for each season
print("Minimum Support for Dry Season O_A: {}".format(len(dry_season_oa_dataset)*60/100))
print("Minimum Support for Rainy Season O_A: {}".format(len(rainy_season_oa_dataset)*60/100))
print("Minimum Support for Haven't Tried O_A: {}".format(len(ht_oa_dataset)*60/100))

Minimum Support for Dry Season O_A: 61.8
Minimum Support for Rainy Season O_A: 38.4
Minimum Support for Haven't Tried O_A: 56.4


# Generate Frequent Itemsets and Support

In [40]:
# Create dataframe (name - dataset - minimum support)
dataset_names_list = ["Dry_Season_Dataset", "Rainy_Season_Dataset", "Havent_Tried_Dataset"]
datasets_list = [dry_season_oa_dataset, rainy_season_oa_dataset, ht_oa_dataset]
minimum_support_list = [0.606, 0.2, 0.3]

# Compile previous lists to dataframe
df_df = pd.DataFrame({"Dataset_Name":dataset_names_list, 
                      "Dataset":datasets_list, 
                      "Min_Support":minimum_support_list})
df_df

Unnamed: 0,Dataset_Name,Dataset,Min_Support
0,Dry_Season_Dataset,"[[Swimming, Biking, Hiking, Basketball, Volley...",0.606
1,Rainy_Season_Dataset,"[[Swimming, Volleyball, Fishing, Sightseeing, ...",0.2
2,Havent_Tried_Dataset,"[[Scuba_Diving, Surfing, Wakeboarding, Boating...",0.3


In [41]:
te_dataset_list = []
te_list = []

transformed_itemset_list = []

for row in df_df.itertuples():
    # Use transaction Encoder to transform the dataset into true or false values
    te = TransactionEncoder()
    dataset = row.Dataset
    oa_dataset = te.fit(dataset).transform(dataset)
    
    print("--------------------------------------------------")
    print("Dataset Name: {}".format(row.Dataset_Name))
    print("Minimum Support: {}".format(row.Min_Support))
    print("--------------------------------------------------")
    
    df_oa_dataset = pd.DataFrame(oa_dataset, columns=te.columns_)
    display(df_oa_dataset.head())
    
    transformed_itemset_list.append(df_oa_dataset)

--------------------------------------------------
Dataset Name: Dry_Season_Dataset
Minimum Support: 0.606
--------------------------------------------------


Unnamed: 0,Badminton,Basketball,Biking,Boating,Bungee_Jumping,Camping,Fishing,FoodTrip,Golfing,Hiking,Picnic,Scuba_Diving,Sightseeing,Sky_Diving,Surfing,Swimming,Tennis,Volleyball,Wakeboarding,Wildlife_Viewing
0,False,True,True,False,False,False,False,True,False,True,True,False,True,False,False,True,False,True,False,False
1,True,False,True,False,True,True,True,True,False,True,True,False,True,False,False,True,False,True,False,False
2,True,True,True,True,False,True,False,True,False,True,True,True,True,False,False,True,True,True,False,True
3,True,True,False,True,False,True,True,True,True,True,True,True,True,False,False,True,False,True,False,True
4,True,True,True,True,False,True,False,True,False,False,True,False,True,False,False,True,False,True,False,False


--------------------------------------------------
Dataset Name: Rainy_Season_Dataset
Minimum Support: 0.2
--------------------------------------------------


Unnamed: 0,Badminton,Basketball,Biking,Boating,Camping,Fishing,FoodTrip,Golfing,Hiking,Picnic,Scuba_Diving,Sightseeing,Sky_Diving,Surfing,Swimming,Tennis,Volleyball,Wakeboarding,Wildlife_Viewing
0,False,False,False,False,False,True,True,False,False,True,False,True,False,False,True,False,True,False,False
1,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
2,False,False,False,False,False,False,True,False,False,False,False,True,False,False,True,False,False,False,False
3,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False
4,True,False,True,False,True,True,True,False,True,False,False,True,False,False,False,True,False,False,True


--------------------------------------------------
Dataset Name: Havent_Tried_Dataset
Minimum Support: 0.3
--------------------------------------------------


Unnamed: 0,Badminton,Basketball,Biking,Boating,Bungee_Jumping,Camping,Fishing,FoodTrip,Golfing,Hiking,Picnic,Scuba_Diving,Sightseeing,Sky_Diving,Surfing,Tennis,Volleyball,Wakeboarding,Wildlife_Viewing
0,True,False,False,True,True,True,True,False,True,False,False,True,False,True,True,True,False,True,True
1,False,True,False,True,False,False,False,False,True,False,False,True,False,True,True,True,False,True,True
2,False,False,False,False,True,False,True,False,True,False,False,False,False,True,True,False,False,True,False
3,False,False,True,False,True,False,False,False,False,False,False,False,False,True,True,True,False,True,False
4,False,False,False,False,True,False,True,False,True,True,False,True,False,True,True,True,False,True,True


In [42]:
print("--------------------------------------------------")
print("Compiled Transformed Datasets")
print("--------------------------------------------------")

# Compile transaction encoder transformed dataframes
df_te = pd.DataFrame({"Dataset_Name":dataset_names_list, 
                      "Dataset":transformed_itemset_list, 
                      "Min_Support":minimum_support_list})
df_te

--------------------------------------------------
Compiled Transformed Datasets
--------------------------------------------------


Unnamed: 0,Dataset_Name,Dataset,Min_Support
0,Dry_Season_Dataset,Badminton Basketball Biking Boating B...,0.606
1,Rainy_Season_Dataset,Badminton Basketball Biking Boating Ca...,0.2
2,Havent_Tried_Dataset,Badminton Basketball Biking Boating Bu...,0.3


# APRIORI In preparation for Association Rules

In [43]:
frequent_itemsets_list = []

for row in df_te.itertuples():
    print("--------------------------------------------------")
    print("Dataset Name: {}".format(row.Dataset_Name))
    print("Length: {}".format(len(row.Dataset)))
    print("Minimum Support: {}".format(row.Min_Support))
    print("--------------------------------------------------")
    
    freq_itemsets = apriori(row.Dataset, min_support = row.Min_Support, use_colnames = True)
    display(freq_itemsets.sort_values(by=['support']).tail())
    
    frequent_itemsets_list.append(freq_itemsets)

--------------------------------------------------
Dataset Name: Dry_Season_Dataset
Length: 103
Minimum Support: 0.606
--------------------------------------------------


Unnamed: 0,support,itemsets
7,0.941748,(Picnic)
52,0.941748,"(Swimming, Picnic)"
5,0.961165,(FoodTrip)
45,0.961165,"(FoodTrip, Swimming)"
9,0.990291,(Swimming)


--------------------------------------------------
Dataset Name: Rainy_Season_Dataset
Length: 64
Minimum Support: 0.2
--------------------------------------------------


Unnamed: 0,support,itemsets
3,0.453125,(Camping)
7,0.46875,(Picnic)
32,0.578125,"(FoodTrip, Sightseeing)"
8,0.65625,(Sightseeing)
5,0.75,(FoodTrip)


--------------------------------------------------
Dataset Name: Havent_Tried_Dataset
Length: 94
Minimum Support: 0.3
--------------------------------------------------


Unnamed: 0,support,itemsets
30,0.787234,"(Golfing, Sky_Diving)"
3,0.797872,(Golfing)
17,0.904255,"(Sky_Diving, Bungee_Jumping)"
1,0.925532,(Bungee_Jumping)
5,0.968085,(Sky_Diving)


In [44]:
# Compile frequent itemsets
df_freq = pd.DataFrame({"Dataset_Name":dataset_names_list, 
                        "Dataset":frequent_itemsets_list,
                        "Min_Support":minimum_support_list})
df_freq

Unnamed: 0,Dataset_Name,Dataset,Min_Support
0,Dry_Season_Dataset,support ...,0.606
1,Rainy_Season_Dataset,support ...,0.2
2,Havent_Tried_Dataset,support ...,0.3


# Association Rules

In [84]:
for row in df_freq.itertuples():
    print("--------------------------------------------------")
    print("Dataset Name: {}".format(row.Dataset_Name))
    print("Length: {}".format(len(row.Dataset)))
    print("Minimum Support: {}".format(row.Min_Support))
    print("--------------------------------------------------")
    
    res = association_rules(row.Dataset, metric="confidence", min_threshold=0.85)
    display(res[res['confidence'] >= 0.85].sort_values(by = 'confidence'))

--------------------------------------------------
Dataset Name: Dry_Season_Dataset
Length: 515
Minimum Support: 0.606
--------------------------------------------------


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
2988,"(Biking, Swimming, Hiking)","(FoodTrip, Camping)",0.776699,0.815534,0.660194,0.85,1.042262,0.026770,1.229773
6332,"(FoodTrip, Swimming, Camping, Picnic)","(Sightseeing, Hiking, Badminton)",0.776699,0.728155,0.660194,0.85,1.167333,0.094637,1.812298
6347,"(FoodTrip, Camping, Picnic)","(Sightseeing, Swimming, Hiking, Badminton)",0.776699,0.728155,0.660194,0.85,1.167333,0.094637,1.812298
526,"(Biking, Hiking)","(Camping, Badminton)",0.776699,0.757282,0.660194,0.85,1.122436,0.072014,1.618123
4114,"(Biking, Hiking)","(Sightseeing, Camping, Badminton, Picnic)",0.776699,0.728155,0.660194,0.85,1.167333,0.094637,1.812298
...,...,...,...,...,...,...,...,...,...
5001,"(Biking, Picnic, Sightseeing, Camping, Boating)",(Swimming),0.631068,0.990291,0.631068,1.00,1.009804,0.006127,inf
4127,"(Biking, Camping, Hiking, Badminton)","(Swimming, Picnic)",0.660194,0.941748,0.660194,1.00,1.061856,0.038458,inf
2968,"(Biking, FoodTrip, Camping, Hiking)",(Sightseeing),0.660194,0.922330,0.660194,1.00,1.084211,0.051277,inf
3463,"(FoodTrip, Sightseeing, Boating, Picnic)",(Swimming),0.737864,0.990291,0.737864,1.00,1.009804,0.007164,inf


--------------------------------------------------
Dataset Name: Rainy_Season_Dataset
Length: 85
Minimum Support: 0.2
--------------------------------------------------


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
11,"(FoodTrip, Hiking)",(Biking),0.312500,0.43750,0.265625,0.850000,1.942857,0.128906,3.750000
5,(Wildlife_Viewing),(Sightseeing),0.328125,0.65625,0.281250,0.857143,1.306122,0.065918,2.406250
0,(Camping),(FoodTrip),0.453125,0.75000,0.390625,0.862069,1.149425,0.050781,1.812500
13,"(Biking, FoodTrip)",(Sightseeing),0.343750,0.65625,0.296875,0.863636,1.316017,0.071289,2.520833
49,"(Biking, Sightseeing, Camping)",(Picnic),0.234375,0.46875,0.203125,0.866667,1.848889,0.093262,3.984375
...,...,...,...,...,...,...,...,...,...
73,"(Camping, Hiking, Picnic)",(Sightseeing),0.218750,0.65625,0.218750,1.000000,1.523810,0.075195,inf
30,"(Sightseeing, Hiking)",(FoodTrip),0.281250,0.75000,0.281250,1.000000,1.333333,0.070312,inf
75,"(FoodTrip, Hiking, Picnic)",(Sightseeing),0.234375,0.65625,0.234375,1.000000,1.523810,0.080566,inf
28,"(Hiking, Picnic)",(FoodTrip),0.234375,0.75000,0.234375,1.000000,1.333333,0.058594,inf


--------------------------------------------------
Dataset Name: Havent_Tried_Dataset
Length: 256
Minimum Support: 0.3
--------------------------------------------------


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
240,"(Wildlife_Viewing, Golfing)","(Scuba_Diving, Bungee_Jumping)",0.425532,0.638298,0.361702,0.85,1.331667,0.090086,2.411348
242,"(Wildlife_Viewing, Bungee_Jumping)","(Golfing, Scuba_Diving)",0.425532,0.595745,0.361702,0.85,1.426786,0.108194,2.695035
854,"(Golfing, Fishing, Wakeboarding)","(Scuba_Diving, Sky_Diving, Bungee_Jumping)",0.425532,0.617021,0.361702,0.85,1.377586,0.099140,2.553191
241,"(Wildlife_Viewing, Scuba_Diving)","(Golfing, Bungee_Jumping)",0.425532,0.734043,0.361702,0.85,1.157971,0.049344,1.773050
89,"(Wildlife_Viewing, Bungee_Jumping)",(Wakeboarding),0.425532,0.734043,0.361702,0.85,1.157971,0.049344,1.773050
...,...,...,...,...,...,...,...,...,...
875,"(Bungee_Jumping, Golfing, Sky_Diving, Surfing,...",(Wakeboarding),0.340426,0.734043,0.340426,1.00,1.362319,0.090539,inf
876,"(Bungee_Jumping, Golfing, Surfing, Wakeboardin...",(Sky_Diving),0.340426,0.968085,0.340426,1.00,1.032967,0.010865,inf
879,"(Golfing, Fishing, Surfing, Bungee_Jumping)","(Sky_Diving, Wakeboarding)",0.340426,0.723404,0.340426,1.00,1.382353,0.094160,inf
893,"(Bungee_Jumping, Surfing, Wakeboarding, Fishin...",(Sky_Diving),0.340426,0.968085,0.340426,1.00,1.032967,0.010865,inf


# Preventing Forces Visuals

In [1]:
import pandas as pd

# Initiate a dictionary for the dataframe
data_dict = {
    "Store":[1, 2, 3, 4, 5, 6],
    "Store Space (sq m)":[5, 5, 10, 15, 15, 20],
    "Weekly sales (in thousands)":[160, 220, 190, 230, 270, 290]
}

# Create dataframe using the data_dict dictionary as its entry data
df = pd.DataFrame(data_dict)
df

Unnamed: 0,Store,Store Space (sq m),Weekly sales (in thousands)
0,1,5,160
1,2,5,220
2,3,10,190
3,4,15,230
4,5,15,270
5,6,20,290


In [5]:
# Separate dependent variable from independent variables
x = df["Store Space (sq m)"]
y = df["Weekly sales (in thousands)"]

print(x)
print(y)

0     5
1     5
2    10
3    15
4    15
5    20
Name: Store Space (sq m), dtype: int64
0    160
1    220
2    190
3    230
4    270
5    290
Name: Weekly sales (in thousands), dtype: int64


In [8]:
import numpy as np
print(np.corrcoef(x, y))

[[1.         0.84090909]
 [0.84090909 1.        ]]


In [25]:
import scipy

# Scipy library to contain slope, intercept, rvalue for the requirements
slope, intercept, rvalue, pvalue, stderr = scipy.stats.linregress(x, y)

print("y = {}x + {}".format(slope, intercept))
print("r^2 = {}".format(rvalue*rvalue))

y = 6.727272727272728x + 148.18181818181816
r^2 = 0.7071280991735538


In [10]:
from sklearn.metrics import r2_score

print(r2_score(y, x))

-23.44318181818182


In [18]:
import scipy
scipy.stats.linregress(x, y)

LinregressResult(slope=6.727272727272728, intercept=148.18181818181816, rvalue=0.8409090909090909, pvalue=0.035951587152516874, stderr=2.164705618143756, intercept_stderr=27.946229361707992)