In [1]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

# Support threshold handed to apriori as `min_support` when the frequent
# itemsets are mined.
min_support = 0.004
# Directory that holds the input CSV files and the cached pickle artifacts.
data_folder = './data'

In [2]:
# Course catalogue read from the data folder.
all_courses = pd.read_csv(data_folder + '/all_courses.csv')

# Quick look: dimensions first, then the leading rows.
print(all_courses.shape)
all_courses.head()

(307, 3)


Unnamed: 0,course_id,title,description
0,ML0201EN,robots are coming build iot apps with watson ...,have fun with iot and learn along the way if ...
1,ML0122EN,accelerating deep learning with gpu,training complex deep learning models with lar...
2,GPXX0ZG0EN,consuming restful services using the reactive ...,learn how to use a reactive jax rs client to a...
3,RP0105EN,analyzing big data in r using apache spark,apache spark is a popular cluster computing fr...
4,GPXX0Z2PEN,containerizing packaging and running a sprin...,learn how to containerize package and run a ...


In [3]:
# Load the ratings and keep only the rows whose rating is strictly above 3;
# everything downstream works on this filtered frame.
ratings = (
    pd.read_csv(data_folder + '/ratings.csv')
    .loc[lambda frame: frame['rating'] > 3]
)

print(ratings.shape)
ratings.head()

(179039, 3)


Unnamed: 0,user_id,course_id,rating
0,1889878,CC0101EN,4.0
2,1990814,ML0120ENv3,4.5
3,380098,BD0211EN,5.0
6,367075,DS0301EN,4.5
8,600100,BD0211EN,4.0


In [4]:
# Sanity checks: distinct users, distinct rated courses, catalogue size, and
# any rated course id that is missing from the catalogue.
rated_course_ids = ratings['course_id']
catalogue_course_ids = all_courses['course_id']

print(ratings['user_id'].nunique())
print(rated_course_ids.nunique())
print(catalogue_course_ids.nunique())
print(set(rated_course_ids).difference(catalogue_course_ids))

31944
136
307
set()


In [5]:
# One transaction per user: the list of course ids that user has in `ratings`
# (already restricted to ratings above 3 when it was loaded).
#
# The result is cached as a pickle so re-runs skip the groupby. When the cache
# file does not exist (fresh checkout, cleared data folder) it is rebuilt from
# `ratings` and written back, so the notebook still runs top to bottom.
# Delete the file to force a rebuild, e.g. after `ratings` changes.
#
# NOTE: unpickling can execute arbitrary code -- only load a pickle that this
# notebook produced itself.
transactions_path = f'{data_folder}/apriori_transactions.pkl'
try:
    transactions = pd.read_pickle(transactions_path)
except FileNotFoundError:
    transactions = (
        ratings.groupby('user_id')['course_id']
        .apply(list)
        .reset_index()
        .rename(columns={'course_id': 'rated_courses'})
    )
    transactions.to_pickle(transactions_path)

print(transactions.shape)
transactions.head()

(31944, 2)


Unnamed: 0,user_id,rated_courses
0,2,"[LB0105ENv1, PY0101EN, DA0101EN, BD0123EN, BD0..."
1,4,"[PY0101EN, BD0153EN, BD0101EN, BD0135EN, DS032..."
2,5,"[LB0107ENv1, ST0201EN, DB0151EN, DS0110EN, DS0..."
3,7,[BD0211EN]
4,8,"[BD0111EN, BD0212EN]"


In [6]:
# Every user's list of rated courses is treated as one transaction.
rated_courses_list = list(transactions['rated_courses'])

# One-hot encode: learn the set of course ids, then turn each transaction into
# a boolean row with one column per course id.
te = TransactionEncoder()
te.fit(rated_courses_list)
te_ary = te.transform(rated_courses_list)
encoded_transactions = pd.DataFrame(data=te_ary, columns=te.columns_)

print(encoded_transactions.shape)
encoded_transactions.head()

(31944, 136)


Unnamed: 0,AI0111EN,BC0101EN,BC0201EN,BC0202EN,BD0101EN,BD0111EN,BD0115EN,BD0121EN,BD0123EN,BD0131EN,...,WA0101EN,WA0103EN,excourse40,excourse46,excourse47,excourse52,excourse54,excourse69,excourse77,excourse84
0,False,True,False,False,True,True,False,False,True,True,...,True,False,False,False,False,False,False,False,False,False
1,False,False,False,False,True,True,True,True,True,True,...,False,True,False,False,False,False,False,False,False,False
2,False,True,False,False,False,False,False,False,True,False,...,False,True,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [7]:
# Frequent itemsets mined with apriori from the one-hot encoded transactions.
#
# The result is cached as a pickle so re-runs skip the mining step. When the
# cache file does not exist it is recomputed and written back, so the notebook
# still runs top to bottom on a fresh data folder.
# The cache file name does not encode `min_support`: delete the file after
# changing `min_support` or the input data, otherwise stale itemsets are loaded.
#
# NOTE: unpickling can execute arbitrary code -- only load a pickle that this
# notebook produced itself.
frequent_itemsets_path = f'{data_folder}/apriori_frequent_itemsets.pkl'
try:
    frequent_itemsets = pd.read_pickle(frequent_itemsets_path)
except FileNotFoundError:
    frequent_itemsets = apriori(encoded_transactions, min_support=min_support, use_colnames=True)
    frequent_itemsets.to_pickle(frequent_itemsets_path)

print(frequent_itemsets.shape)
frequent_itemsets.head()

(12475, 2)


Unnamed: 0,support,itemsets
0,0.009078,(AI0111EN)
1,0.16219,(BC0101EN)
2,0.069152,(BC0201EN)
3,0.018125,(BC0202EN)
4,0.317963,(BD0101EN)


In [8]:
# Derive association rules with confidence of at least 0.2, then keep only the
# rules whose lift is above 2.
rules = (
    association_rules(frequent_itemsets, metric='confidence', min_threshold=0.2)
    .loc[lambda candidate_rules: candidate_rules['lift'] > 2]
)

# Persist the filtered rules, then show their dimensions and leading rows.
rules.to_pickle(f'{data_folder}/apriori_rules.pkl')
print(rules.shape)
rules.head()

(62138, 14)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(AI0111EN),(ML0101ENv3),0.009078,0.183759,0.005416,0.596552,3.24638,1.0,0.003747,2.023161,0.698304,0.028896,0.505724,0.313012
2,(BC0201EN),(BC0101EN),0.069152,0.16219,0.028425,0.411046,2.534346,1.0,0.017209,1.422538,0.650397,0.14008,0.297031,0.293151
3,(BC0202EN),(BC0101EN),0.018125,0.16219,0.011145,0.614853,3.790942,1.0,0.008205,2.1753,0.749804,0.065877,0.540293,0.341783
9,(CB0105ENv1),(BC0101EN),0.022539,0.16219,0.007732,0.343056,2.115145,1.0,0.004077,1.275313,0.539376,0.043686,0.215879,0.195365
14,(CC0250EN),(BC0101EN),0.011614,0.16219,0.004351,0.374663,2.310025,1.0,0.002468,1.339774,0.573768,0.025679,0.253605,0.200746
