# Last FM hometask <br>
https://www.kaggle.com/ravichaubey1506/lastfm <br>
1. Выбрать данные по странам своей группы (совместно): <br>
    3530203_70101: Germany, Netherlands <br>
    3530203_70102: Belarus, Ukraine, Poland, Russian Federation<br>
    3530903_70301: Sweden, Finland, Norway, Denmark, Iceland<br>
    3530903_70302: Spain, Portugal, France, Italy, Belgium<br>
    
2. Попытаться найти полезные с точки зрения продвижения групп (или еще чего-нибудь) и нетривиальные правила, используя алгоритмы Apriori, FPGrowth, FPMax и всевозможные метрики. Хотя бы 5 правил.
3. Вывести эти правила в отдельных ячейках. 
4. Подумать, как можно было бы использовать полученные правила.

In [1]:
import pandas as pd
import numpy as np

In [10]:
# Load the raw Last.fm listening table from the working directory
# and preview its first ten rows.
df = pd.read_csv(
    "lastfm.csv",
)
df.head(n=10)


Unnamed: 0,user,artist,sex,country
0,1,red hot chili peppers,f,Germany
1,1,the black dahlia murder,f,Germany
2,1,goldfrapp,f,Germany
3,1,dropkick murphys,f,Germany
4,1,le tigre,f,Germany
5,1,schandmaul,f,Germany
6,1,edguy,f,Germany
7,1,jack johnson,f,Germany
8,1,eluveitie,f,Germany
9,1,the killers,f,Germany


#### отбираем данные из варианта

In [11]:
# Keep only the rows for the countries of group 3530203_70102.
# .copy() makes the subset an independent frame, so later modifications of df
# operate on its own data instead of on a slice of the full table (which would
# raise SettingWithCopyWarning).
df = df.loc[
    df['country'].isin(['Belarus', 'Ukraine', 'Poland', 'Russian Federation'])
].copy()

#### нашли "неизвестного" исполнителя, удаляем строки с ним

In [20]:
# Per-column count of the rows whose artist is the '[unknown]' placeholder.
df.loc[df['artist'].eq('[unknown]')].count()

user       103
artist     103
sex        103
country    103
dtype: int64

In [21]:

# Mark the '[unknown]' placeholder artist as missing, then drop those rows.
# assign() builds a new frame instead of calling replace(inplace=True) on
# df['artist']: that chained in-place call acts on a temporary Series, so it
# emits chained-assignment warnings and does not update df at all once pandas
# Copy-on-Write is active.
df = df.assign(artist=df['artist'].replace('[unknown]', np.nan))
print(df.isna().sum().sum())  # missing cells before the drop
# dropna() removes every row that has a missing value in any column.
df = df.dropna()
df.isna().sum().sum()  # missing cells left after the drop

103


0

#### группируем по юзеру, полу и стране артистов, то есть кто каких артистов слушает

In [22]:
# Collapse the table to one row per (user, sex, country) group; the artists of
# each group are concatenated into a single string with '<~>' as the separator.
artists_by_listener = df.groupby(['user', 'sex', 'country'])['artist']
gdf = artists_by_listener.apply('<~>'.join).reset_index()
gdf

Unnamed: 0,user,sex,country,artist
0,35,m,Ukraine,radiohead<~>the kooks<~>coldplay<~>nine inch n...
1,39,m,Russian Federation,britney spears<~>a-ha<~>joss stone<~>christina...
2,50,f,Russian Federation,the ataris<~>yann tiersen<~>the smashing pumpk...
3,64,m,Poland,apparat<~>drowning pool<~>armin van buuren<~>a...
4,82,m,Russian Federation,p.o.d.<~>chimaira<~>arch enemy<~>scar symmetry...
...,...,...,...,...
2002,19703,f,Poland,the offspring<~>pearl jam<~>the smashing pumpk...
2003,19706,f,Russian Federation,enigma<~>pink floyd<~>the offspring<~>scorpions
2004,19710,f,Russian Federation,vnv nation<~>nine inch nails<~>kmfdm<~>apoptyg...
2005,19712,m,Poland,john williams<~>abba<~>vangelis<~>hans zimmer<...


#### создаем dummy таблицу для дальнейшей обработки

In [23]:
# Split every joined artist string on '<~>' and build a 0/1 indicator table:
# one row per row of gdf, one column per distinct artist.
joined_artists = gdf['artist']
ddf = joined_artists.str.get_dummies(sep='<~>')
ddf.head(n=10)

Unnamed: 0,...and you will know us by the trail of dead,2pac,3 doors down,30 seconds to mars,311,36 crazyfists,44,50 cent,65daysofstatic,a day to remember,...,weezer,wilco,within temptation,wolfgang amadeus mozart,wu-tang clan,yann tiersen,yeah yeah yeahs,yellowcard,zero 7,Édith piaf
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,1,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### импортируем алгоритмы и создаем суппорт через априори алгоритм (можно заметить, что действительно наиболее медленный)

In [24]:
from mlxtend.frequent_patterns import apriori, association_rules, fpgrowth

# Mine frequent artist sets with Apriori: keep every itemset that occurs in at
# least 1% of the rows of ddf and label itemsets with the artist names
# (use_colnames=True) rather than with column positions.
# The 0/1 indicator table is cast to bool at the call site because mlxtend
# expects boolean input; integer frames raise a DeprecationWarning and are
# mined more slowly. The cast does not change the resulting supports.
frequent_itemsets = apriori(
    ddf.astype(bool),
    min_support=0.01,
    use_colnames=True,
)

# Show the itemsets ordered from the most to the least frequent.
frequent_itemsets.sort_values('support', ascending=False)





Unnamed: 0,support,itemsets
308,0.129048,(metallica)
477,0.123568,(the prodigy)
392,0.122073,(red hot chili peppers)
385,0.114599,(radiohead)
101,0.112108,(coldplay)
...,...,...
742,0.010463,"(deftones, marilyn manson)"
646,0.010463,"(björk, the doors)"
554,0.010463,"(ac/dc, nirvana)"
732,0.010463,"(tool, coma)"


#### fpgrowth алгоритм

In [25]:
# Mine the same frequent artist sets with FP-Growth, using the same 1% minimum
# support and artist names as item labels.
# The 0/1 indicator table is cast to bool at the call site because mlxtend
# expects boolean input; integer frames raise a DeprecationWarning and are
# mined more slowly. The cast does not change the resulting supports.
frequent_itemsets_fpg = fpgrowth(
    ddf.astype(bool),
    min_support=0.01,
    use_colnames=True,
)

# Show the itemsets ordered from the most to the least frequent.
frequent_itemsets_fpg.sort_values('support', ascending=False)

Unnamed: 0,support,itemsets
155,0.129048,(metallica)
108,0.123568,(the prodigy)
58,0.122073,(red hot chili peppers)
0,0.114599,(radiohead)
1,0.112108,(coldplay)
...,...,...
664,0.010463,"(30 seconds to mars, slipknot)"
658,0.010463,"(sum 41, blink-182)"
656,0.010463,"(red hot chili peppers, joy division)"
648,0.010463,"(sigur rós, muse)"


#### теперь правила по apriori, сортируем по lift
'lift': <br>
$$\mathrm{lift}(A \to C) = \frac{\mathrm{confidence}(A \to C)}{\mathrm{support}(C)}, \qquad \text{range: } [0, \infty]$$ <br>
The lift metric is commonly used to measure how much more often the antecedent and consequent of a rule A->C occur together than we would expect if they were statistically independent. If A and C are independent, the Lift score will be exactly 1.
<br> <br>
В конкретной таблице видно, что наибольший лифт у тех правил, которые реже встречаются, но имеют хорошую уверенность. То есть у редких, но метких правил.<br>
Уже можно заметить, что первые два правила очень сильно зависят друг от друга, у них наибольший lift, люди, которые слушают первое, определенно будут слушать второе, и наоборот

In [30]:
# Derive association rules from the Apriori itemsets, keeping only the rules
# whose confidence reaches 0.5.
rules = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=0.5,
)

# Preview the five rules with the highest lift.
rules_by_lift = rules.sort_values('lift', ascending=False)
rules_by_lift.head(5)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
33,"(paul van dyk, armin van buuren)",(above & beyond),0.016442,0.021923,0.010962,0.666667,30.409091,0.010601,2.93423
34,(above & beyond),"(paul van dyk, armin van buuren)",0.021923,0.016442,0.010962,0.5,30.409091,0.010601,1.967115
49,"(atb, armin van buuren)",(ferry corsten),0.017937,0.022422,0.010463,0.583333,26.016667,0.010061,2.346188
32,"(above & beyond, armin van buuren)",(paul van dyk),0.014449,0.030892,0.010962,0.758621,24.557286,0.010515,4.014877
3,(above & beyond),(ferry corsten),0.021923,0.022422,0.011958,0.545455,24.327273,0.011467,2.150673


#### правила по fpgrowth, так же сортировка по lift
(первые 2 строчки поменялись местами)

In [27]:
# Derive association rules from the FP-Growth itemsets, keeping only the rules
# whose confidence reaches 0.5.
rules2 = association_rules(
    frequent_itemsets_fpg,
    metric="confidence",
    min_threshold=0.5,
)

# Preview the five rules with the highest lift.
rules2_by_lift = rules2.sort_values('lift', ascending=False)
rules2_by_lift.head(5)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
69,(above & beyond),"(paul van dyk, armin van buuren)",0.021923,0.016442,0.010962,0.5,30.409091,0.010601,1.967115
68,"(paul van dyk, armin van buuren)",(above & beyond),0.016442,0.021923,0.010962,0.666667,30.409091,0.010601,2.93423
38,"(atb, armin van buuren)",(ferry corsten),0.017937,0.022422,0.010463,0.583333,26.016667,0.010061,2.346188
67,"(above & beyond, armin van buuren)",(paul van dyk),0.014449,0.030892,0.010962,0.758621,24.557286,0.010515,4.014877
63,(above & beyond),(ferry corsten),0.021923,0.022422,0.011958,0.545455,24.327273,0.011467,2.150673


Вот эти правила очень хорошие: они по встречаемости не самые редкие, уверенность приемлемая, зависимость хорошая. Для рекомендаций хорошо подойдут. <br>
Из-за того, что лифт хороший, их можно пробовать рекомендовать в обе стороны.

In [42]:
# Rules that are both strongly dependent (lift above 10) and comparatively
# frequent (support above 0.014448). No additional confidence cut is applied
# here beyond the 0.5 threshold used when the rules were generated.
rules.loc[
    rules['lift'].gt(10)
    & rules['support'].gt(0.014448)
]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
1,(above & beyond),(armin van buuren),0.021923,0.038864,0.014449,0.659091,16.958916,0.013597,2.819332
7,(ferry corsten),(armin van buuren),0.022422,0.038864,0.015446,0.688889,17.725641,0.014575,3.089366
8,(paul van dyk),(armin van buuren),0.030892,0.038864,0.016442,0.532258,13.695409,0.015242,2.054843
9,(tiësto),(armin van buuren),0.030892,0.038864,0.015446,0.5,12.865385,0.014245,1.922272
18,(kaiser chiefs),(franz ferdinand),0.023418,0.056303,0.014449,0.617021,10.958953,0.013131,2.464098
37,"(akurat, happysad)",(pidżama porno),0.023916,0.050324,0.014449,0.604167,12.005569,0.013246,2.399182
38,"(akurat, pidżama porno)",(happysad),0.020927,0.053812,0.014449,0.690476,12.831349,0.013323,3.056916
39,"(happysad, pidżama porno)",(akurat),0.02292,0.048331,0.014449,0.630435,13.044151,0.013342,2.575105


Наиболее встречаемые правила, можно заметить, что уверенность у них тоже в порядке, но зависимость низкая, это значит, что рекомендация будет работать больше лишь в одну сторону

In [47]:
# The most frequent rules: support above 0.016 is the only condition,
# lift and confidence are left unrestricted.
rules.loc[rules['support'].gt(0.016)]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(a perfect circle),(tool),0.03986,0.056303,0.022422,0.5625,9.990597,0.020177,2.157022
8,(paul van dyk),(armin van buuren),0.030892,0.038864,0.016442,0.532258,13.695409,0.015242,2.054843
12,(breaking benjamin),(linkin park),0.035874,0.107623,0.018435,0.513889,4.774884,0.014575,1.835746
19,(iron maiden),(metallica),0.058794,0.129048,0.031888,0.542373,4.202866,0.024301,1.903191
27,(tricky),(massive attack),0.031888,0.080219,0.017439,0.546875,6.817255,0.014881,2.029861


самые уверенные правила <br>
они не частые, это можно увидеть по их суппорту.

In [53]:
# The most confident rules: confidence above 0.69 is the only condition,
# lift and support are left unrestricted.
rules.loc[rules['confidence'].gt(0.69)]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
23,(t.i.),(kanye west),0.015944,0.041854,0.012456,0.78125,18.666295,0.011789,4.380098
31,"(above & beyond, paul van dyk)",(armin van buuren),0.012955,0.038864,0.010962,0.846154,21.772189,0.010458,6.247384
32,"(above & beyond, armin van buuren)",(paul van dyk),0.014449,0.030892,0.010962,0.758621,24.557286,0.010515,4.014877
38,"(akurat, pidżama porno)",(happysad),0.020927,0.053812,0.014449,0.690476,12.831349,0.013323,3.056916
47,"(ferry corsten, atb)",(armin van buuren),0.011958,0.038864,0.010463,0.875,22.514423,0.009999,7.689088


И еще можно увидеть, что встречаемость следствия выше, чем у причины. Это значит, что людей, слушающих менее популярных артистов, интересуют более популярные, и таких людей больше, чем тех, кто слушает популярных артистов, но предпочитает менее популярных.<br>
И у правил ниже, то есть обратных, самый высокий лифт, то есть они прям очень хорошо зависимы

In [64]:
# Rules whose antecedent is more frequent than their consequent;
# lift and confidence are left unrestricted.
rules.loc[
    rules['antecedent support'].gt(rules['consequent support'])
]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
4,(ferry corsten),(above & beyond),0.022422,0.021923,0.011958,0.533333,24.327273,0.011467,2.095879
34,(above & beyond),"(paul van dyk, armin van buuren)",0.021923,0.016442,0.010962,0.5,30.409091,0.010601,1.967115


можно рекомендовать популярных исполнителей

In [68]:
# Rules that point to a popular consequent (consequent support above 0.03);
# lift and confidence are left unrestricted.
rules.loc[rules['consequent support'].gt(0.03)]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(a perfect circle),(tool),0.039860,0.056303,0.022422,0.562500,9.990597,0.020177,2.157022
1,(above & beyond),(armin van buuren),0.021923,0.038864,0.014449,0.659091,16.958916,0.013597,2.819332
2,(above & beyond),(atb),0.021923,0.048829,0.011460,0.522727,10.705241,0.010389,1.992930
5,(above & beyond),(paul van dyk),0.021923,0.030892,0.012955,0.590909,19.128299,0.012277,2.368931
6,(blank & jones),(armin van buuren),0.021923,0.038864,0.010962,0.500000,12.865385,0.010110,1.922272
...,...,...,...,...,...,...,...,...,...
67,"(placebo, nirvana)",(muse),0.022920,0.101644,0.012456,0.543478,5.346867,0.010127,1.967827
68,"(muse, nirvana)",(placebo),0.024415,0.102641,0.012456,0.510204,4.970775,0.009950,1.832108
69,"(placebo, red hot chili peppers)",(muse),0.020927,0.101644,0.011460,0.547619,5.387605,0.009333,1.985839
70,"(the killers, muse)",(placebo),0.022422,0.102641,0.011958,0.533333,5.196117,0.009657,1.922913


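Скорее всего, эти правила пригодятся для продвижения: причина встречается редко (antecedent support < 0.018), а следствие — заметно чаще (consequent support > 0.03).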

In [71]:
# Rules that lead from a rare antecedent (antecedent support below 0.018)
# to a popular consequent (consequent support above 0.03);
# lift is left unrestricted.
rules.loc[
    rules['antecedent support'].lt(0.018)
    & rules['consequent support'].gt(0.03)
]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
23,(t.i.),(kanye west),0.015944,0.041854,0.012456,0.78125,18.666295,0.011789,4.380098
24,(king crimson),(pink floyd),0.017439,0.099153,0.010463,0.6,6.051256,0.008734,2.252118
31,"(above & beyond, paul van dyk)",(armin van buuren),0.012955,0.038864,0.010962,0.846154,21.772189,0.010458,6.247384
32,"(above & beyond, armin van buuren)",(paul van dyk),0.014449,0.030892,0.010962,0.758621,24.557286,0.010515,4.014877
40,"(akurat, red hot chili peppers)",(happysad),0.016442,0.053812,0.010463,0.636364,11.825758,0.009579,2.602018
41,"(red hot chili peppers, happysad)",(akurat),0.016442,0.048331,0.010463,0.636364,13.166823,0.009669,2.61709
46,"(arctic monkeys, radiohead)",(coldplay),0.016941,0.112108,0.010463,0.617647,5.509412,0.008564,2.32218
47,"(ferry corsten, atb)",(armin van buuren),0.011958,0.038864,0.010463,0.875,22.514423,0.009999,7.689088
48,"(ferry corsten, armin van buuren)",(atb),0.015446,0.048829,0.010463,0.677419,13.873272,0.009709,2.94863
58,"(myslovitz, pidżama porno)",(happysad),0.015446,0.053812,0.010463,0.677419,12.58871,0.009632,2.933184


In [138]:
# The same "most frequent rules" view for the FP-Growth rules:
# lift above 0.7 (a loose bound) together with support above 0.016.
rules2.loc[
    rules2['lift'].gt(0.7)
    & rules2['support'].gt(0.016)
]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(a perfect circle),(tool),0.03986,0.056303,0.022422,0.5625,9.990597,0.020177,2.157022
8,(paul van dyk),(armin van buuren),0.030892,0.038864,0.016442,0.532258,13.695409,0.015242,2.054843
12,(breaking benjamin),(linkin park),0.035874,0.107623,0.018435,0.513889,4.774884,0.014575,1.835746
19,(iron maiden),(metallica),0.058794,0.129048,0.031888,0.542373,4.202866,0.024301,1.903191
27,(tricky),(massive attack),0.031888,0.080219,0.017439,0.546875,6.817255,0.014881,2.029861
