### Data Mining Using Python

In [10]:
import csv

In [11]:
# For compatibility across multiple platforms
import os
# IB is True exactly when the INSTABASE_URI environment variable is set
IB = 'INSTABASE_URI' in os.environ
# NOTE(review): `ib` is not defined anywhere in this notebook; presumably the
# Instabase runtime injects it -- confirm. It is only touched when IB is True.
if IB:
    open = ib.open
else:
    open = open

In [12]:
# Read shopping dataset from CSV file
# Create dictionary "Sitems" with key = item and value = list of transactions
# Also set variable Snumtrans = number of transactions
Sitems = {}
trans = []  # list of transactions used to set Snumtrans
with open('Shop.csv','rU') as f:
    rows = csv.DictReader(f)
    for r in rows:
        if r['item'] not in Sitems:
            Sitems[r['item']] = [r['TID']]
        else:
            Sitems[r['item']].append(r['TID'])
        if r['TID'] not in trans:
            trans.append(r['TID'])
Snumtrans = len(trans)
print Snumtrans
print Sitems

5
{'juice': ['1', '2', '5'], 'eggs': ['1', '3', '4'], 'chips': ['3', '5'], 'milk': ['1', '2', '4', '5'], 'cookies': ['2', '5']}


In [13]:
# Read movies dataset from CSV file
# Create dictionary "Mitems" with key = item and value = list of transactions
# Also set variable Mnumtrans = number of transactions
Mitems = {}
trans = []  # list of transactions used to set Mnumtrans
with open('Movies.csv','rU') as f:
    rows = csv.DictReader(f)
    for r in rows:
        if r['item'] not in Mitems:
            Mitems[r['item']] = [r['TID']]
        else:
            Mitems[r['item']].append(r['TID'])
        if r['TID'] not in trans:
            trans.append(r['TID'])
Mnumtrans = len(trans)
print Mnumtrans
# print Mitems

1382


## Shopping dataset - frequent item-sets

#### First some Python features to be used

In [14]:
# Iterating through dictionaries
for i in Sitems:
    print i
    print Sitems[i]

juice
['1', '2', '5']
eggs
['1', '3', '4']
chips
['3', '5']
milk
['1', '2', '4', '5']
cookies
['2', '5']


In [15]:
# Intersecting lists
# How many transactions contain both eggs and milk?
# Each list holds the transaction IDs (as strings) in which the item occurs.
list1 = Sitems['eggs']
print list1
list2 = Sitems['milk']
print list2
# Converting both lists to sets lets "&" keep only the IDs present in both.
# Note that list3 is therefore a set, not a list, despite its name.
list3 = set(list1) & set(list2)
print list3
# add print len(list3)

['1', '3', '4']
['1', '2', '4', '5']
set(['1', '4'])


### Frequent item-sets of two

#### First compute all pairs of items and the number of transactions they occur together in (see what's wrong and fix it)

In [16]:
pairs = []
for i1 in Sitems:
    for i2 in Sitems:
        common = len(set(Sitems[i1]) & set(Sitems[i2]))
        pairs.append([i1, i2, common])
print pairs

[['juice', 'juice', 3], ['juice', 'eggs', 1], ['juice', 'chips', 1], ['juice', 'milk', 3], ['juice', 'cookies', 2], ['eggs', 'juice', 1], ['eggs', 'eggs', 3], ['eggs', 'chips', 1], ['eggs', 'milk', 2], ['eggs', 'cookies', 0], ['chips', 'juice', 1], ['chips', 'eggs', 1], ['chips', 'chips', 2], ['chips', 'milk', 1], ['chips', 'cookies', 1], ['milk', 'juice', 3], ['milk', 'eggs', 2], ['milk', 'chips', 1], ['milk', 'milk', 4], ['milk', 'cookies', 2], ['cookies', 'juice', 2], ['cookies', 'eggs', 0], ['cookies', 'chips', 1], ['cookies', 'milk', 2], ['cookies', 'cookies', 2]]


#### Print pairs that meet support threshold (see what's wrong and fix it)

In [17]:
support = .3
for p in pairs:
    if p[2]/Snumtrans > support:
        print p[0], '|', p[1]

In [18]:
# Fold previous two code boxes together into one program
WILL ADD CODE HERE

SyntaxError: invalid syntax (<ipython-input-18-d1ae2a5cb6b4>, line 2)

### Frequent item-sets of three

In [None]:
support = .3
for i1 in Sitems:
    for i2 in Sitems:
        for i3 in Sitems:
            if i1 < i2 and i2 < i3:
                common = len(set(Sitems[i1]) & set(Sitems[i2]) & set(Sitems[i3]))
                if float(common)/Snumtrans > support:
                    print i1, '|', i2, '|', i3

### Apriori algorithm: frequent item-sets of one, then two, then three

#### Frequent item-sets of one

In [None]:
support = .3
ones = []
for i in Sitems:
    if float(len(Sitems[i]))/Snumtrans > support:
        ones.append(i)
print ones

#### Frequent item-sets of two using only items from frequent item-sets of one

In [None]:
support = .3
twos = []
for i1 in ones:
    for i2 in ones:
        if i1 < i2:
            common = len(set(Sitems[i1]) & set(Sitems[i2]))
            if float(common)/Snumtrans > support:
                twos.append([i1, i2])
print twos

#### Frequent item-sets of three using only items from frequent item-sets of two and one (see what's wrong and fix it)

In [None]:
support = .3
for one in ones:
    for two in twos:
        if one not in two:
            common = len(set(Sitems[one]) & set(Sitems[two[0]]) & set(Sitems[two[1]]))
            if float(common)/Snumtrans > support:
                print one, '|', two[0], '|', two[1]

### <font color = 'green'>Your Turn - Movies dataset frequent item-sets</font>

In [None]:
print Mnumtrans
print len(Mitems)

#### Use the brute-force method to mine for frequent item-sets of three and four items in the Movies dataset. Find a single support threshold where the number of frequent item-sets of three items is more than 10 but less than 20, and the number of frequent item-sets of four items is more than 0. WATCH OUT! The calculation for item-sets of four can be quite slow.

In [None]:
# Frequent item-sets of three
support = INSERT VALUE HERE
YOUR CODE HERE

In [None]:
# Frequent item-sets of four
support = INSERT VALUE HERE (SAME VALUE AS IN PREVIOUS CELL)
YOUR CODE HERE

#### Now compute frequent item-sets of four using the Apriori method. Notice how incredibly much faster it is.

In [None]:
# Frequent item-sets of four using Apriori
support = INSERT VALUE HERE
YOUR CODE HERE

## Shopping dataset - association rules

### Association rules with one item on the left-hand side

#### First compute frequent item-sets of one item, as candidate left-hand sides of association rules. Include the number of transactions the items occur in.

In [None]:
support = .5
frequentLHS = []
for i in Sitems:
    if float(len(Sitems[i]))/Snumtrans > support:
        frequentLHS.append([i,len(Sitems[i])])
print frequentLHS

#### Now find right-hand side items with sufficient confidence (see what's wrong and fix it)

In [None]:
confidence = .5
for lhs in frequentLHS:
    for i in Sitems:
        common = len(set(Sitems[lhs[0]]) & set(Sitems[i]))
        if float(common)/lhs[1] > confidence:
            print lhs[0], '->', i

### Association rules with two items on the left-hand side

#### First compute frequent item-sets of two items, as candidate left-hand sides of association rules. Include the number of transactions the items occur in.

In [None]:
support = .5
frequentLHS = []
for i1 in Sitems:
    for i2 in Sitems:
        if i1 < i2:
            common = len(set(Sitems[i1]) & set(Sitems[i2]))
            if float(common)/Snumtrans > support:
                frequentLHS.append([i1,i2,common])
print frequentLHS

#### Now find right-hand side items with sufficient confidence

In [None]:
confidence = .5
for lhs in frequentLHS:
    for i in Sitems:
        if i not in lhs:
            common = len(set(Sitems[lhs[0]]) & set(Sitems[lhs[1]]) & set(Sitems[i]))
            if float(common)/lhs[2] > confidence:
                print lhs[0], '|', lhs[1], '->', i

## Shopping dataset - association rules with lift instead of confidence

### Association rules with one item on the left-hand side

#### First compute frequent item-sets of one item, as candidate left-hand sides of association rules. Include the number of transactions the items occur in.

In [None]:
support = .5
frequentLHS = []
for i in Sitems:
    if float(len(Sitems[i]))/Snumtrans > support:
        frequentLHS.append([i,len(Sitems[i])])
print frequentLHS

#### Now find right-hand side items yielding lift > 1

In [None]:
for lhs in frequentLHS:
    for i in Sitems:
        if lhs[0] <> i:
            common = len(set(Sitems[lhs[0]]) & set(Sitems[i]))
            lift = (float(common)/lhs[1]) / (float(len(Sitems[i]))/Snumtrans)
            if lift > 1:
                print lhs[0], '->', i, 'with lift', lift

### Association rules with two items on the left-hand side

#### First compute frequent item-sets of two items, as candidate left-hand sides of association rules. Include the number of transactions the items occur in.

In [None]:
support = .5
frequentLHS = []
for i1 in Sitems:
    for i2 in Sitems:
        if i1 < i2:
            common = len(set(Sitems[i1]) & set(Sitems[i2]))
            if float(common)/Snumtrans > support:
                frequentLHS.append([i1,i2,common])
print frequentLHS

#### Now find right-hand side items yielding lift > 1

In [None]:
confidence = .5
for lhs in frequentLHS:
    for i in Sitems:
        if i not in lhs:
            common = len(set(Sitems[lhs[0]]) & set(Sitems[lhs[1]]) & set(Sitems[i]))
            lift = (float(common)/lhs[2]) / (float(len(Sitems[i]))/Snumtrans)
            if lift > 1:
                print lhs[0], '|', lhs[1], '->', i, 'with lift', lift

### <font color = 'green'>Your Turn - Movies dataset association rules</font>

#### Mine for association rules in the Movies dataset with three items on the left-hand side. Find support and confidence thresholds (need not be the same) so the number of association rules is more than 10 but less than 20.


In [None]:
# Association rules with three items on the left-hand side
support = INSERT VALUE HERE
confidence = INSERT VALUE HERE
YOUR CODE HERE

#### Mine for association rules in the Movies dataset with three items on the left-hand side. Find support and lift thresholds so the number of association rules is more than 10 but less than 20.


In [None]:
# Association rules with three items on the left-hand side
support = INSERT VALUE HERE
liftthresh = INSERT VALUE HERE
YOUR CODE HERE