In [11]:
import os
import numpy as np
from collections import Counter
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.svm import SVC, NuSVC, LinearSVC
from sklearn.metrics import confusion_matrix


In [12]:
### Creating word dictionary
def make_Dictionary(train_dir):
    emails = [os.path.join(train_dir,f) for f in os.listdir(train_dir)]    
    all_words = []       
    for mail in emails:    
        with open(mail) as m:
            for i,line in enumerate(m):
                if i == 2:  #Body of email is only 3rd line of text file
                    words = line.split()
                    all_words += words
    
    dictionary = Counter(all_words)
    # Paste code for non-word removal here(code snippet is given below) 
    return dictionary

### Feature extraction process
def extract_features(mail_dir): 
    files = [os.path.join(mail_dir,fi) for fi in os.listdir(mail_dir)]
    features_matrix = np.zeros((len(files),3000))
    docID = 0;
    for fil in files:
        with open(fil) as fi:
            for i,line in enumerate(fi):
                if i == 2:
                    words = line.split()
                    for word in words:
                        wordID = 0
                        for i,d in enumerate(dictionary):
                            if d[0] == word:
                                wordID = i
                                features_matrix[docID,wordID] = words.count(word)
            docID = docID + 1     
    return features_matrix

In [14]:
# Create a dictionary of words with its frequency

train_dir = 'train-mails'
dictionary = make_Dictionary(train_dir)

list_to_remove = dictionary.keys()
for item in list(dictionary):
    if item.isalpha() == False:            #isalpha
        del dictionary[item]
    elif len(item) == 1:                   #length==1
        del dictionary[item]
dictionary = dictionary.most_common(3000)  #most_common
print (dictionary[:5])

# Prepare feature vectors per training mail and its labels

train_labels = np.zeros(702)
train_labels[351:701] = 1
train_matrix = extract_features(train_dir)

print (train_matrix.shape)

[('order', 1414), ('address', 1293), ('report', 1216), ('mail', 1127), ('send', 1079)]
(702, 3000)


In [39]:
# Training SVM and Naive bayes classifier

model1 = MultinomialNB()
model2 = LinearSVC()
model1.fit(train_matrix, train_labels)
model2.fit(train_matrix, train_labels)

# Test the unseen mails for Spam
test_dir = 'test-mails'
test_matrix = extract_features(test_dir)
test_labels = np.zeros(260)
test_labels[130:260] = 1

result1 = model1.predict(test_matrix)
result2 = model2.predict(test_matrix)
print (confusion_matrix(test_labels, result1))
print (confusion_matrix(test_labels, result2))

[[129   1]
 [  9 121]]
[[126   4]
 [  6 124]]


> Input的意義是否不同?????

### Finding Important Words in Text Using TF-IDF

In [47]:
import math
from textblob import TextBlob as tb

def tf(word, blob):
    return blob.words.count(word) / len(blob.words)

def n_containing(word, bloblist):
    return sum(1 for blob in bloblist if word in blob)

def idf(word, bloblist):
    return math.log(len(bloblist) / (1 + n_containing(word, bloblist)))

def tfidf(word, blob, bloblist):
    return tf(word, blob) * idf(word, bloblist)

In [78]:
document1 = tb("""Python is a 2000 made-for-TV horror movie directed by Richard
Clabaugh. The film features several cult favorite actors, including William
Zabka of The Karate Kid fame, Wil Wheaton, Casper Van Dien, Jenny McCarthy,
Keith Coogan, Robert Englund (best known for his role as Freddy Krueger in the
A Nightmare on Elm Street series of films), Dana Barron, David Bowe, and Sean
Whalen. The film concerns a genetically engineered snake, a python, that
escapes and unleashes itself on a small town. It includes the classic final
girl scenario evident in films like Friday the 13th. It was filmed in Los Angeles,
 California and Malibu, California. Python was followed by two sequels: Python
 II (2002) and Boa vs. Python (2004), both also made-for-TV films.""")

document2 = tb("""Python, from the Greek word (πύθων/πύθωνας), is a genus of
nonvenomous pythons[2] found in Africa and Asia. Currently, 7 species are
recognised.[2] A member of this genus, P. reticulatus, is among the longest
snakes known.""")

document3 = tb("""The Colt Python is a .357 Magnum caliber revolver formerly
manufactured by Colt's Manufacturing Company of Hartford, Connecticut.
It is sometimes referred to as a "Combat Magnum".[1] It was first introduced
in 1955, the same year as Smith &amp; Wesson's M29 .44 Magnum. The now discontinued
Colt Python targeted the premium revolver market segment. Some firearm
collectors and writers such as Jeff Cooper, Ian V. Hogg, Chuck Hawks, Leroy
Thompson, Renee Smeets and Martin Dougherty have described the Python as the
finest production revolver ever made.""")

In [50]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\stat_pc\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [51]:
bloblist = [document1, document2, document3]
for i, blob in enumerate(bloblist):
    print("Top words in document {}".format(i + 1))
    scores = {word: tfidf(word, blob, bloblist) for word in blob.words}
    sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    for word, score in sorted_words[:3]:
        print("Word: {}, TF-IDF: {}".format(word, round(score, 5)))

Top words in document 1
Word: films, TF-IDF: 0.00997
Word: made-for-TV, TF-IDF: 0.00665
Word: film, TF-IDF: 0.00665
Top words in document 2
Word: genus, TF-IDF: 0.02192
Word: from, TF-IDF: 0.01096
Word: Greek, TF-IDF: 0.01096
Top words in document 3
Word: Colt, TF-IDF: 0.01367
Word: Magnum, TF-IDF: 0.01367
Word: revolver, TF-IDF: 0.01367


In [93]:
emails = [os.path.join(test_dir,f) for f in os.listdir(test_dir)]
bloblist = []
for mail in emails:    
    with open(mail) as m:
        for i,line in enumerate(m):
            if i == 2:  #Body of email is only 3rd line of text file
                line = tb(line)
                bloblist.append(line)


In [94]:
for i, blob in enumerate(bloblist):
    print("Top words in document {}".format(i + 1))
    scores = {word: tfidf(word, blob, bloblist) for word in blob.words}
    sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    for word, score in sorted_words[:3]:
        print("Word: {}, TF-IDF: {}".format(word, round(score, 5)))

Top words in document 1
Word: speech, TF-IDF: 0.06997
Word: physiological, TF-IDF: 0.055
Word: gruyter, TF-IDF: 0.03801
Top words in document 2
Word: predication, TF-IDF: 0.16285
Word: secondary, TF-IDF: 0.12188
Word: gruyter, TF-IDF: 0.0982
Top words in document 3
Word: english, TF-IDF: 0.09043
Word: gruyter, TF-IDF: 0.06499
Word: preterite, TF-IDF: 0.06467
Top words in document 4
Word: 818-677, TF-IDF: 0.12588
Word: csun, TF-IDF: 0.12588
Word: northridge, TF-IDF: 0.08392
Top words in document 5
Word: yiddish, TF-IDF: 0.15565
Word: nanterre, TF-IDF: 0.06918
Word: juin, TF-IDF: 0.0566
Top words in document 6
Word: candidate, TF-IDF: 0.04947
Word: hiroshima, TF-IDF: 0.04947
Word: spanish, TF-IDF: 0.04809
Top words in document 7
Word: notice, TF-IDF: 0.14624
Word: board, TF-IDF: 0.08364
Word: linguistlist, TF-IDF: 0.07275
Top words in document 8
Word: benjamin, TF-IDF: 0.10275
Word: labov, TF-IDF: 0.05754
Word: volume, TF-IDF: 0.05368
Top words in document 9
Word: resijb, TF-IDF: 0.20281

Top words in document 71
Word: clite, TF-IDF: 0.08392
Word: szege, TF-IDF: 0.08392
Word: hungary, TF-IDF: 0.08392
Top words in document 72
Word: fluency, TF-IDF: 0.09184
Word: session, TF-IDF: 0.08584
Word: berkeley, TF-IDF: 0.06346
Top words in document 73
Word: discourse, TF-IDF: 0.11803
Word: teun, TF-IDF: 0.10065
Word: specialization, TF-IDF: 0.0671
Top words in document 74
Word: esca, TF-IDF: 0.081
Word: lpl, TF-IDF: 0.07761
Word: spontaneous, TF-IDF: 0.06653
Top words in document 75
Word: modal, TF-IDF: 0.1913
Word: modality, TF-IDF: 0.07889
Word: epistemic, TF-IDF: 0.0676
Top words in document 76
Word: discourse, TF-IDF: 0.11803
Word: teun, TF-IDF: 0.10065
Word: specialization, TF-IDF: 0.0671
Top words in document 77
Word: evaluation, TF-IDF: 0.06371
Word: scheme, TF-IDF: 0.04153
Word: carroll, TF-IDF: 0.03928
Top words in document 78
Word: sfr, TF-IDF: 0.12019
Word: minority, TF-IDF: 0.08305
Word: congress, TF-IDF: 0.08143
Top words in document 79
Word: nlp, TF-IDF: 0.07448
Wor

Word: capitalfm, TF-IDF: 0.0633
Word: album, TF-IDF: 0.0632
Word: lyric, TF-IDF: 0.04407
Top words in document 142
Word: webdomain, TF-IDF: 0.2296
Word: domain, TF-IDF: 0.16309
Word: easy-to, TF-IDF: 0.09184
Top words in document 143
Word: 38, TF-IDF: 0.25976
Word: xenical, TF-IDF: 0.23179
Word: drug, TF-IDF: 0.23179
Top words in document 144
Word: descrambler, TF-IDF: 0.15475
Word: cable, TF-IDF: 0.10893
Word: radar, TF-IDF: 0.05627
Top words in document 145
Word: prices, TF-IDF: 0.06954
Word: 228-2095, TF-IDF: 0.06954
Word: 822223281, TF-IDF: 0.06954
Top words in document 146
Word: capitalfm, TF-IDF: 0.06
Word: festival, TF-IDF: 0.05354
Word: album, TF-IDF: 0.04194
Top words in document 147
Word: report, TF-IDF: 0.04716
Word: program, TF-IDF: 0.02332
Word: business, TF-IDF: 0.02016
Top words in document 148
Word: microsoft, TF-IDF: 0.21951
Word: 2000, TF-IDF: 0.17351
Word: powerpoint, TF-IDF: 0.05795
Top words in document 149
Word: credit, TF-IDF: 0.14616
Word: wipe, TF-IDF: 0.1132
W

Word: valentine, TF-IDF: 0.07158
Word: music, TF-IDF: 0.05873
Word: backstage, TF-IDF: 0.05249
Top words in document 216
Word: travel, TF-IDF: 0.12169
Word: wordlwide, TF-IDF: 0.1132
Word: airline, TF-IDF: 0.09708
Top words in document 217
Word: bobby, TF-IDF: 0.46357
Word: immedaitly, TF-IDF: 0.23179
Word: cousin, TF-IDF: 0.19878
Top words in document 218
Word: nbsp, TF-IDF: 0.291
Word: ipcug, TF-IDF: 0.13888
Word: tufftest, TF-IDF: 0.08598
Top words in document 219
Word: showtime, TF-IDF: 0.05887
Word: brits, TF-IDF: 0.05507
Word: grammy, TF-IDF: 0.03853
Top words in document 220
Word: 95you, TF-IDF: 0.06205
Word: 20, TF-IDF: 0.05907
Word: a0, TF-IDF: 0.05321
Top words in document 221
Word: gift, TF-IDF: 0.39673
Word: somebody, TF-IDF: 0.25619
Word: pardon, TF-IDF: 0.2197
Top words in document 222
Word: photomask, TF-IDF: 0.51625
Word: adtek, TF-IDF: 0.295
Word: adtekphotomask, TF-IDF: 0.1475
Top words in document 223
Word: hollander, TF-IDF: 0.50354
Word: photography, TF-IDF: 0.1678