# Frequent Pattern Mining

In [1]:
from pyspark.sql import SQLContext
from pyspark import SparkContext, SparkConf
# Build the Spark entry points used throughout the notebook.
# getOrCreate() reuses an already-running SparkContext, so re-executing this
# cell in the same kernel no longer fails with "Cannot run multiple
# SparkContexts at once".
conf = SparkConf().setAppName("Frequent Pattern Mining")
sc = SparkContext.getOrCreate(conf=conf)
sqlContext = SQLContext(sc)

In [2]:
from pyspark.ml.fpm import FPGrowth

In [3]:

# Toy transaction table: each row pairs a transaction id with the list of
# item ids bought in that transaction.
df = sqlContext.createDataFrame(
    [(0, [1, 2, 5]), (1, [1, 2, 3, 5]), (2, [1, 2])],
    schema=["id", "items"],
)



In [4]:
# Return every row of the toy table as a list of Row objects for inspection.
df.collect()

[Row(id=0, items=[1, 2, 5]),
 Row(id=1, items=[1, 2, 3, 5]),
 Row(id=2, items=[1, 2])]

In [5]:
# Configure FP-Growth to read transactions from the "items" column, then fit
# it on the toy table.
#   minSupport=0.5    -> keep itemsets present in at least half the transactions
#   minConfidence=0.6 -> keep association rules with confidence of at least 0.6
fpGrowth = FPGrowth(itemsCol="items", minSupport=0.5, minConfidence=0.6)
model = fpGrowth.fit(df)

In [6]:
# Display frequent itemsets: one row per itemset (`items`) together with the
# number of transactions that contain it (`freq`).
model.freqItemsets.show()

+---------+----+
|    items|freq|
+---------+----+
|      [1]|   3|
|      [2]|   3|
|   [2, 1]|   3|
|      [5]|   2|
|   [5, 2]|   2|
|[5, 2, 1]|   2|
|   [5, 1]|   2|
+---------+----+



In [7]:
# Display generated association rules: each row reads
# "antecedent => consequent" with the rule's confidence.
model.associationRules.show()

+----------+----------+------------------+
|antecedent|consequent|        confidence|
+----------+----------+------------------+
|    [5, 2]|       [1]|               1.0|
|       [2]|       [1]|               1.0|
|       [2]|       [5]|0.6666666666666666|
|    [2, 1]|       [5]|0.6666666666666666|
|       [5]|       [2]|               1.0|
|       [5]|       [1]|               1.0|
|    [5, 1]|       [2]|               1.0|
|       [1]|       [2]|               1.0|
|       [1]|       [5]|0.6666666666666666|
+----------+----------+------------------+



In [8]:
# transform() checks each row's items against all the association rules and
# collects the matching consequents into a new `prediction` column.
model.transform(df).show()

+---+------------+----------+
| id|       items|prediction|
+---+------------+----------+
|  0|   [1, 2, 5]|        []|
|  1|[1, 2, 3, 5]|        []|
|  2|      [1, 2]|       [5]|
+---+------------+----------+



In [9]:
# Score one unseen basket that contains only item 1 and print the suggested
# additional items.
new = sqlContext.createDataFrame([(4, [1])], schema=["id", "items"])
model.transform(new).show()

+---+-----+----------+
| id|items|prediction|
+---+-----+----------+
|  4|  [1]|    [2, 5]|
+---+-----+----------+



# Applying FP-Growth to Real-World Data (Supermarket Baskets)

In [10]:
# List the working directory to confirm the dataset CSV sits next to the notebook.
!ls

Frequent Pattern Mining.ipynb  Market_Basket_Optimisation.csv


In [11]:
# Path of the market-basket CSV, relative to the notebook's working directory.
filename = "Market_Basket_Optimisation.csv"

In [12]:
# Collect every item name that occurs in the CSV (duplicates included).
# Each line is one basket of comma-separated names; rstrip() drops the trailing
# newline and any trailing spaces from each field.
items = []
with open(filename, "r") as basket_file:  # the context manager closes the handle
    for line in basket_file:
        items.extend(field.rstrip() for field in line.split(","))
        

In [13]:
# Give every distinct item name an integer id (its position in the set
# iteration order) and print the resulting id -> name lookup table.
id_to_name = dict(enumerate(set(items)))
print(id_to_name)

{0: 'champagne', 1: 'mayonnaise', 2: 'cider', 3: 'candy bars', 4: 'fresh bread', 5: 'nonfat milk', 6: 'red wine', 7: 'bacon', 8: 'pet food', 9: 'mint', 10: 'pancakes', 11: 'chili', 12: 'toothpaste', 13: 'tomato juice', 14: 'cottage cheese', 15: 'salmon', 16: 'white wine', 17: 'barbecue sauce', 18: 'chocolate', 19: 'parmesan cheese', 20: 'napkins', 21: 'strawberries', 22: 'ketchup', 23: 'carrots', 24: 'frozen vegetables', 25: 'magazines', 26: 'chicken', 27: 'french fries', 28: 'almonds', 29: 'avocado', 30: 'mushroom cream sauce', 31: 'pepper', 32: 'cream', 33: 'chutney', 34: 'cake', 35: 'corn', 36: 'burger sauce', 37: 'honey', 38: 'meatballs', 39: 'fresh tuna', 40: 'mint green tea', 41: 'eggs', 42: 'bug spray', 43: 'tea', 44: 'green grapes', 45: 'antioxydant juice', 46: 'sparkling water', 47: 'frozen smoothie', 48: 'bramble', 49: 'energy drink', 50: 'cooking oil', 51: 'dessert wine', 52: 'strong cheese', 53: 'oatmeal', 54: 'asparagus', 55: 'black tea', 56: 'mineral water', 57: 'energy b

In [14]:
# Reverse lookup (item name -> integer id).
# Built by inverting id_to_name rather than enumerating set(items) a second
# time, so the two tables are inverse mappings by construction instead of
# relying on two separate set iterations yielding the same order.
name_to_id = {name: item_id for item_id, name in id_to_name.items()}
print(name_to_id)

{'champagne': 0, 'mayonnaise': 1, 'cider': 2, 'candy bars': 3, 'fresh bread': 4, 'nonfat milk': 5, 'red wine': 6, 'bacon': 7, 'pet food': 8, 'mint': 9, 'pancakes': 10, 'chili': 11, 'toothpaste': 12, 'tomato juice': 13, 'cottage cheese': 14, 'salmon': 15, 'white wine': 16, 'barbecue sauce': 17, 'chocolate': 18, 'parmesan cheese': 19, 'napkins': 20, 'strawberries': 21, 'ketchup': 22, 'carrots': 23, 'frozen vegetables': 24, 'magazines': 25, 'chicken': 26, 'french fries': 27, 'almonds': 28, 'avocado': 29, 'mushroom cream sauce': 30, 'pepper': 31, 'cream': 32, 'chutney': 33, 'cake': 34, 'corn': 35, 'burger sauce': 36, 'honey': 37, 'meatballs': 38, 'fresh tuna': 39, 'mint green tea': 40, 'eggs': 41, 'bug spray': 42, 'tea': 43, 'green grapes': 44, 'antioxydant juice': 45, 'sparkling water': 46, 'frozen smoothie': 47, 'bramble': 48, 'energy drink': 49, 'cooking oil': 50, 'dessert wine': 51, 'strong cheese': 52, 'oatmeal': 53, 'asparagus': 54, 'black tea': 55, 'mineral water': 56, 'energy bar':

In [15]:
# Turn every CSV line (one basket) into a list of distinct integer item ids.
# Duplicates within a basket are removed with set(); FP-Growth expects the
# items of a transaction to be unique.
data = []
with open(filename, "r") as basket_file:  # the context manager closes the handle
    for line in basket_file:
        basket_ids = [name_to_id[field.rstrip()] for field in line.split(",")]
        data.append(list(set(basket_ids)))
    
    

In [16]:
# Pair each basket with its row index to form (id, items) tuples.
arr = list(enumerate(data))

In [17]:
# Load the (id, items) tuples into a Spark DataFrame for FP-Growth.
market = sqlContext.createDataFrame(arr, schema=["id", "items"])

In [18]:
# Return every basket as a list of Row objects for inspection.
# NOTE(review): this prints the whole dataset into the notebook; a bounded
# preview such as market.take(5) would likely be enough here.
market.collect()

[Row(id=0, items=[64, 37, 70, 71, 78, 44, 13, 110, 14, 79, 49, 76, 15, 116, 45, 86, 47, 56, 28, 29]),
 Row(id=1, items=[41, 92, 38]),
 Row(id=2, items=[33]),
 Row(id=3, items=[29, 87]),
 Row(id=4, items=[76, 113, 117, 56, 57]),
 Row(id=5, items=[79]),
 Row(id=6, items=[88, 27]),
 Row(id=7, items=[104, 89, 103]),
 Row(id=8, items=[24, 83, 76]),
 Row(id=9, items=[27]),
 Row(id=10, items=[8, 41]),
 Row(id=11, items=[58]),
 Row(id=12, items=[41, 50, 87, 56, 92]),
 Row(id=13, items=[0, 58, 83]),
 Row(id=14, items=[56, 15]),
 Row(id=15, items=[56]),
 Row(id=16, items=[37, 71, 105, 79, 18, 50, 26]),
 Row(id=17, items=[41, 87]),
 Row(id=18, items=[26, 39, 41, 15, 112, 83, 55, 87, 56, 90]),
 Row(id=19, items=[37, 38, 117, 118, 27]),
 Row(id=20, items=[6, 71, 41, 108, 80, 18, 31]),
 Row(id=21, items=[82, 46]),
 Row(id=22, items=[96, 10, 76, 83, 56, 63]),
 Row(id=23, items=[37, 71, 74, 12, 80, 16, 92, 29]),
 Row(id=24, items=[41]),
 Row(id=25, items=[4, 104, 83, 19, 117, 29]),
 Row(id=26, items=[

In [60]:
# Fit FP-Growth on the supermarket baskets with much lower thresholds than the
# toy example: itemsets need 1% support, rules need 30% confidence.
fpGrowth = FPGrowth(itemsCol="items", minSupport=0.01, minConfidence=0.3)
model = fpGrowth.fit(market)

In [61]:
# Display frequent itemsets (item ids with their transaction counts).
# show() prints only the first 20 rows.
model.freqItemsets.show()

+-------------+----+
|        items|freq|
+-------------+----+
|        [119]| 243|
|        [112]|  90|
|         [85]| 193|
|     [85, 56]|  77|
|         [24]| 715|
|     [24, 83]| 209|
| [24, 83, 56]|  90|
|     [24, 76]| 108|
|     [24, 18]| 172|
|    [24, 111]| 127|
|     [24, 41]| 163|
|     [24, 27]| 143|
|    [24, 117]| 177|
|[24, 117, 56]|  83|
|     [24, 56]| 268|
|        [118]| 139|
|         [26]| 450|
|     [26, 83]| 129|
|     [26, 76]|  89|
|     [26, 18]| 110|
+-------------+----+
only showing top 20 rows



In [62]:
# Display generated association rules ("antecedent => consequent" with the
# rule's confidence); item ids map back to names through id_to_name.
model.associationRules.show()

+----------+----------+-------------------+
|antecedent|consequent|         confidence|
+----------+----------+-------------------+
|  [64, 56]|      [83]| 0.3719806763285024|
|  [10, 56]|      [83]|0.33992094861660077|
| [117, 56]|      [83]| 0.3277777777777778|
| [111, 83]|      [56]|0.43537414965986393|
|      [50]|      [83]|0.31070496083550914|
|      [50]|      [56]|0.39425587467362927|
|      [26]|      [56]|               0.38|
| [117, 41]|      [56]|0.42424242424242425|
|  [10, 83]|      [56]|  0.455026455026455|
|      [15]|      [83]| 0.3166144200626959|
|      [15]|      [56]| 0.4012539184952978|
| [111, 56]|      [83]| 0.4169381107491857|
|  [24, 83]|      [56]|  0.430622009569378|
|       [4]|      [56]|0.30959752321981426|
|      [37]|      [56]|0.31741573033707865|
|      [87]|      [41]|0.31130063965884863|
|      [87]|      [56]| 0.3070362473347548|
|  [83, 41]|      [56]| 0.3905109489051095|
|      [29]|      [56]|              0.348|
|      [83]|      [56]| 0.343032

In [63]:
# Apply the mined rules to every basket; the `prediction` column lists the
# suggested item ids for that basket.
model.transform(market).show()

+---+--------------------+----------+
| id|               items|prediction|
+---+--------------------+----------+
|  0|[64, 37, 70, 71, ...|      [83]|
|  1|        [41, 92, 38]|        []|
|  2|                [33]|        []|
|  3|            [29, 87]|  [41, 56]|
|  4|[76, 113, 117, 56...|      [83]|
|  5|                [79]|      [56]|
|  6|            [88, 27]|        []|
|  7|      [104, 89, 103]| [117, 56]|
|  8|        [24, 83, 76]|      [56]|
|  9|                [27]|        []|
| 10|             [8, 41]|        []|
| 11|                [58]|        []|
| 12|[41, 50, 87, 56, 92]|      [83]|
| 13|         [0, 58, 83]|      [56]|
| 14|            [56, 15]|      [83]|
| 15|                [56]|        []|
| 16|[37, 71, 105, 79,...|  [83, 56]|
| 17|            [41, 87]|      [56]|
| 18|[26, 39, 41, 15, ...|        []|
| 19|[37, 38, 117, 118...|      [56]|
+---+--------------------+----------+
only showing top 20 rows



In [64]:
# Build a one-basket DataFrame containing a single product and score it.
# The product is looked up by name instead of hard-coding its id: ids come
# from set iteration order over strings, which can differ between interpreter
# sessions, so a fixed number may refer to a different product after a kernel
# restart. In the mapping printed earlier in this notebook, 'chicken' is id 26.
new = sqlContext.createDataFrame([
    (4, [name_to_id["chicken"]])
], ["id", "items"])

# `prediction` will hold the items the rules suggest for this basket.
result = model.transform(new)

In [65]:
# Print the suggested items for the single-product basket.
result.show()

+---+-----+----------+
| id|items|prediction|
+---+-----+----------+
|  4| [26]|      [56]|
+---+-----+----------+

