# Data Structures:
- Lists
- Dicts
- Strings
- Tuples
- Sets

## Lists:

In [1]:
# Append:
my_list = ['d','b','c','b', 'a']
my_list.append(['d', 'e', 'f'])
my_list

['d', 'b', 'c', 'b', 'a', ['d', 'e', 'f']]

In [2]:
my_list[-1][1]

'e'

In [3]:
# Extend:
my_list = ['d','b','c','b', 'a']
my_list.extend(['d', 'e', 'f'])
my_list

['d', 'b', 'c', 'b', 'a', 'd', 'e', 'f']

In [4]:
# Sort:
sorted_list = sorted(my_list)
sorted_list

['a', 'b', 'b', 'c', 'd', 'd', 'e', 'f']

In [5]:
# Index:
my_list = ['d','b','c','b', 'a']
my_list.index('b')

1

In [6]:
# Slicing:
# list[firstIndex:lastIndex:step]
a = [11, 12, 13, 14, 15, 16, 17, 18]
print a[3:6]
print a[:5]
print a[:5:2]

[14, 15, 16]
[11, 12, 13, 14, 15]
[11, 13, 15]


In [7]:
# List Comprehension:
even = []
for i in a:
    if i % 2 == 0:
        even.append(i)
print even

[12, 14, 16, 18]


In [8]:
even2 = [i for i in a if i % 2 == 0]
print even2

[12, 14, 16, 18]


## Dicts:

In [9]:
population = {'Montreal':2, 'Delhi':19, 'New York':9, 'London':8}
print "Keys: ", population.keys()
print "Values: ", population.values()

Keys:  ['New York', 'Delhi', 'London', 'Montreal']
Values:  [9, 19, 8, 2]


In [10]:
for (key, val) in population.items():
    print key + "'s population: " + str(val) + " million."

New York's population: 9 million.
Delhi's population: 19 million.
London's population: 8 million.
Montreal's population: 2 million.


In [11]:
print population['London']

8


In [12]:
print population['SF']

KeyError: 'SF'

In [13]:
from collections import defaultdict
pop = defaultdict(int) # Default value is 0
pop['Montreal'] = 2
pop['Delhi'] = 19
print pop['Delhi']

19


In [14]:
# No more key errors:
print pop['SF']

0


In [15]:
# Could also use Lambda in defaultdict:
pop = defaultdict(lambda: 'IDK buddy')
pop['Montreal'] = 2
pop['Delhi'] = 19
print pop['SF']


# Lambda is for creating quick anonymous functions
f = lambda x,y: x+y
print f(4,6)

IDK buddy
10


## Sets:

In [16]:
a = set([1, 2, 3, 4])
b = set([3, 4, 5, 6])

In [17]:
print a.union(b)

set([1, 2, 3, 4, 5, 6])


In [18]:
inter = a.intersection(b)
print inter

set([3, 4])


In [19]:
print inter.issubset(a)

True


In [20]:
print a - b # a But Not b

set([1, 2])


#### Fun Facts:
- Ordering of Dictionaries and Sets is arbitrary, so you don't want to rely on them.
- Lists, Dicts, Sets were all mutable.
- Strings, Tuples are immutable.

## Tuples:

In [21]:
tup1 = (12, 18, 2385, 28, 5, 2, 17)
tup2 = ('abc', 'xyz')

print tup1[:5]

(12, 18, 2385, 28, 5)


In [22]:
# Tuples are immutable:
tup1[0] = 100

TypeError: 'tuple' object does not support item assignment

In [23]:
tup3 = tup1 + tup2
print tup3

(12, 18, 2385, 28, 5, 2, 17, 'abc', 'xyz')


In [24]:
# We usually combine these data structures: List of dictionaries, List of tuples etc.
# Tuples are usable as a dictionary keys, while lists are not.

## Strings:

In [25]:
mystr = "This is a dummy string"
print mystr.lower()
print mystr.upper()
print mystr.swapcase()

this is a dummy string
THIS IS A DUMMY STRING
tHIS IS A DUMMY STRING


In [26]:
print mystr.endswith('ing')

True


In [27]:
print mystr.startswith('This')

True


In [28]:
print mystr.find('is')            # returns start index of 'is' first occurence

2


In [29]:
print mystr.find('is', 4)         # starting at index 4, returns start index of 'is' first occurence

5


In [30]:
# Usually the text data has noise (extra whitespaces):
mystr = "    hey, what's up?   "
mystr

"    hey, what's up?   "

In [31]:
mystr.rstrip()

"    hey, what's up?"

In [32]:
mystr.lstrip()

"hey, what's up?   "

In [33]:
mystr.strip()

"hey, what's up?"

In [34]:
mystr = "This; is a; useful method"
str_list = mystr.split(';')
print str_list

['This', ' is a', ' useful method']


In [35]:
print ';'.join(str_list)

This; is a; useful method


## Pickling results:

In [36]:
# An easy way to store intermediate/final outputs. But only do it when your work is specific to Python.
# Cross-language compatibility is not guaranteed.
import pickle

path = './important.pickle'
with open(path, 'wb') as f:
    pickle.dump(str_list, f)

In [37]:
with open(path, 'rb') as f:
    a = pickle.load(f)
print a

['This', ' is a', ' useful method']


# Let's work through a Spam Detection problem:
- I uploaded a toy dataset here: https://github.com/sunyam/Python-Tutorial/tree/master/dataset

In [38]:
# Imports:
import os
import random

from nltk.tokenize import word_tokenize
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords

import sklearn.metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

In [39]:
# Load the dataset:
all_emails = []

def load_dataset(path):
    # Loading spam-emails:
    for email in os.listdir(path+'spam'):
        with open(path+'spam' + '/' + email, 'r') as f:
            content = f.read()
            all_emails.append((content, "spam"))

    # Loading ham-emails:
    for email in os.listdir(path+'ham'):
        f = open(path+'ham' + os.sep + email, 'r')
        content = f.read()
        all_emails.append((content, "ham"))
        f.close()

    print "Total number of emails: ", len(all_emails)

    
    
path = './dataset/'
# 286 Spam emails; 325 Ham emails.
load_dataset(path)

Total number of emails:  611


In [40]:
print all_emails[:2]

[('Subject: advs\r\ngreetings ,\r\ni am benedicta lindiwe hendricks ( mrs ) of rsa . i am writing\r\nthis letter to you with the hope that you will be kind enough\r\nto assist my family .\r\nif this means of communication is not acceptable to you please\r\naccept my apologies as it is the only available and resourceful\r\nmeans for me right now .\r\nmy children and i are in need of your assistance and we sincerely\r\npray and hope that you will be able to attend to our request .\r\nif there is the possibility that you will be able to help us do\r\nkindly let me know by return mail so that i can tell you about\r\nour humble request .\r\nthank for your understanding .\r\nbenedicta lindiwe hendricks ( mrs ) .\r\nplease reply to this email address ; heno 0 @ katamail . com', 'spam'), ('Subject: whats new in summer ? bawled\r\ncarolyn regretful watchfully procrustes godly\r\nsummer 2004 was too hot for the software manufacturers .\r\nno wonder ! as the prices were reduced in 3 - 4 times .\r

In [41]:
def cleanUp(text, custom_stopwords=[]):
    # Initilaise Lemmatizer object:
    lemm = WordNetLemmatizer()
    
    # Load NLTK stopwords:
    my_stopwords = stopwords.words('english') + custom_stopwords
    
    clean_text = ''
    
    words = word_tokenize(unicode(text, errors='ignore')) # word_tokenize() takes care of stripping too.
    
    for word in words:
        w = lemm.lemmatize(word.lower())
        if w not in my_stopwords and len(w)>2:
            clean_text += w + ' '
    
    return clean_text

In [42]:
clean_emails_with_labels = []
for (email, label) in all_emails:
    clean_email = cleanUp(email, ['subject'])
    clean_emails_with_labels.append((clean_email, label))

In [43]:
print clean_emails_with_labels[:2]

[(u'advs greeting benedicta lindiwe hendricks rsa writing letter hope kind enough assist family mean communication acceptable please accept apology available resourceful mean right child need assistance sincerely pray hope able attend request possibility able help kindly let know return mail tell humble request thank understanding benedicta lindiwe hendricks please reply email address heno katamail com ', 'spam'), (u'whats new summer bawled carolyn regretful watchfully procrustes godly summer 2004 hot software manufacturer wonder price reduced time caused software glut world market hand user able time update software possibility almost free charge read whole article year 2004 sotware price fall peter lemelman onerous reclaimers remunerate lounsbury dictate costed continued snooping digression rhine inseminate tilt instructs rejoice switchman stomaching hurtling brent gunner tortoise ', 'spam')]


In [44]:
random.shuffle(clean_emails_with_labels)

In [45]:
print clean_emails_with_labels[:2]

[(u'hello rich resume job service job small tel 408 482 2102 rysio yahoo com wiring installation hand electrical installation perform fitting mounting laying cable commercial industrial residential new existing building power supply light plug receptacle panel fuse box emergency generator wiring testing transformer power line conduit layout bending mounting parking lighting lamp switch post grocery story hardware story restaurant residential housing area computer business fast food unit installation building low voltage office home yard patio parking volt audio video equipment computer monitoring video control backup tape set mounting electro optical assembly subsystem power supply switch motion sensor alarm fire safety system install european standard fiber optic system plc setup motion management master control center machine process control computer dsl internet home net mail set firewall software hardware fax modem cable modem cable sat dish install help support solar project solar

In [46]:
# A great Jupyter feature: Shift+Tab to see function arguments + documentation.
# train_test_split

In [47]:
# We need to pass email-text only to the CountVectorizer
emails = []
labels = []

for (email, label) in clean_emails_with_labels:
    emails.append(email)
    labels.append(label)

#list(zip(*clean_emails_with_labels)[0]) # also gives you "emails"
#list(zip(*clean_emails_with_labels)[1]) # also gives you "labels"

print "Number of emails: ", len(emails)
print "Labels: ", len(labels)

Number of emails:  611
Labels:  611


In [48]:
# Split the dataset:
x_train, x_test, y_train, y_test = train_test_split(emails, labels, test_size=0.33, random_state=42)

In [49]:
# CountVectorizer:
vectorizer = CountVectorizer()

# Fit to our training data
X = vectorizer.fit_transform(x_train)

print X.shape
print len(y_train)

(409, 13251)
409


In [55]:
# Train Naive Bayes:
nb_detector = MultinomialNB()
nb_detector.fit(X, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [56]:
# Just randomly testing the model:
test = [u'article powerits online trading platform please find attached article european electricity report september 2001 kind regard sarimah black', 
        u'need help marriage hello vlgr professi nal per dose vlgr soft per dose generc vlgr per dose clls per dose clls soft per dose sinneth wrongeth soul hate love death son let depart thine eye keep sound wisdom discretion say hated instruction heart despised reproof house righteous much treasure revenue wicked trouble better little righteousness great revenue without right']

x_te = vectorizer.transform(test)
print nb_detector.predict(x_te)

['ham' 'spam']


In [57]:
# Testing properly with test-set:
X_test = vectorizer.transform(x_test)
y_pred = nb_detector.predict(X_test)
print X_test.toarray()
print y_pred

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
['ham' 'spam' 'ham' 'ham' 'spam' 'spam' 'spam' 'spam' 'spam' 'ham' 'ham'
 'spam' 'ham' 'spam' 'ham' 'spam' 'ham' 'ham' 'ham' 'spam' 'spam' 'ham'
 'ham' 'ham' 'spam' 'ham' 'ham' 'spam' 'ham' 'spam' 'ham' 'ham' 'spam'
 'spam' 'ham' 'spam' 'spam' 'ham' 'spam' 'ham' 'spam' 'ham' 'ham' 'ham'
 'spam' 'ham' 'spam' 'spam' 'ham' 'spam' 'spam' 'ham' 'spam' 'spam' 'ham'
 'ham' 'ham' 'ham' 'ham' 'spam' 'ham' 'spam' 'ham' 'spam' 'spam' 'spam'
 'spam' 'spam' 'ham' 'spam' 'spam' 'ham' 'spam' 'ham' 'ham' 'ham' 'spam'
 'ham' 'spam' 'ham' 'ham' 'ham' 'ham' 'spam' 'ham' 'spam' 'spam' 'spam'
 'ham' 'spam' 'spam' 'ham' 'ham' 'ham' 'ham' 'ham' 'spam' 'ham' 'ham'
 'ham' 'ham' 'ham' 'spam' 'ham' 'ham' 'spam' 'spam' 'spam' 'ham' 'spam'
 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'spam' 'spam' 'ham'
 'spam' 'spam' 'ham' 'spam' 'ham' 'spam' 'spam' 'ham' 'spam' 'spam' 'spam'
 'spam' 'ham' '

In [58]:
# Calculate Accuracy:
print "Accuracy: ", sklearn.metrics.accuracy_score(y_test, y_pred)

Accuracy:  0.9801980198019802


In [59]:
# True rows; Predicted Columns ["True" left; "Predict" top]
print sklearn.metrics.confusion_matrix(y_test, y_pred, labels=["spam", "ham"])

[[ 94   4]
 [  0 104]]
