-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathvscikit-learn.py
110 lines (85 loc) · 3.93 KB
/
vscikit-learn.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import collections
import numpy as np
import util
from sklearn.naive_bayes import GaussianNB
def get_words(message):
    """Get the normalized list of words from a message string.

    The message is split on runs of whitespace (spaces, tabs, newlines, ...)
    and every resulting word is converted to lowercase. Punctuation is not
    treated as a separator, so it stays attached to the word it touches.
    The empty string is never returned as a word.

    Args:
        message: A string containing an SMS message

    Returns:
        The list of normalized words from the message, in message order.
    """
    # str.split() with no separator splits on any whitespace run and never
    # yields empty fields. split(' ') only splits on the space character, so
    # tabs and newlines would stay glued to the neighbouring word.
    return [word.lower() for word in message.split()]
def create_dictionary(messages, min_messages=5):
    """Create a dictionary mapping words to integer indices.

    Each message is tokenized with get_words, and a word is counted at most
    once per message. Only words that occur in at least ``min_messages``
    messages are kept, because rare words are often not useful for modeling.

    Indices are assigned consecutively from 0, from the word appearing in the
    most messages to the one appearing in the fewest. Words with equal counts
    keep the order in which they were first seen, so the mapping is the same
    on every run for the same input.

    Args:
        messages: A list of strings containing SMS messages
        min_messages: Minimum number of messages a word must occur in to be
            included in the dictionary. Defaults to 5.

    Returns:
        A python dict mapping words to integers.
    """
    message_counts = collections.Counter()
    for message in messages:
        # dict.fromkeys de-duplicates the words of one message while keeping
        # first-occurrence order; a set would make the Counter's insertion
        # order (and so the tie-breaking below) depend on hash randomisation.
        for word in dict.fromkeys(get_words(message)):
            message_counts[word] += 1

    # most_common() sorts by count, highest first, and is stable for ties.
    frequent_words = [
        word
        for word, count in message_counts.most_common()
        if count >= min_messages
    ]
    return {word: index for index, word in enumerate(frequent_words)}
def transform_text(messages, word_dictionary):
    """Transform a list of text messages into a numpy array of word counts.

    Row i of the result corresponds to the i-th message and column j to the
    vocabulary word whose index in word_dictionary is j. Each message is
    tokenized with get_words; words that are not in word_dictionary are
    ignored.

    Args:
        messages: A list of strings where each string is an SMS message.
        word_dictionary: A python dict mapping words to integers.

    Returns:
        A numpy array of shape (len(messages), len(word_dictionary)) where
        component (i, j) is the number of occurrences of the j-th vocabulary
        word in the i-th message.
    """
    counts = np.zeros((len(messages), len(word_dictionary)))

    for row, text in enumerate(messages):
        tokens = get_words(text)

        # Tally how often each token occurs in this message.
        tally = {}
        for token in tokens:
            tally[token] = tally.get(token, 0) + 1

        # Write the tally of every in-vocabulary token into its column.
        for token in tokens:
            if token not in word_dictionary:
                continue
            counts[row, word_dictionary[token]] = tally[token]

    return counts
def main():
    """Train a Gaussian naive Bayes spam classifier and report test accuracy.

    Loads the training and test sets with util.load_spam_dataset, builds the
    vocabulary from the training messages, converts both sets of messages to
    word-count matrices, fits a GaussianNB model on the training data, saves
    the test-set predictions with np.savetxt, and prints the fraction of
    predictions that equal the test labels.
    """
    train_messages, train_labels = util.load_spam_dataset('spam_train.tsv')
    test_messages, test_labels = util.load_spam_dataset('spam_test.tsv')

    vocabulary = create_dictionary(train_messages)
    print('Size of dictionary: ', len(vocabulary))

    train_features = transform_text(train_messages, vocabulary)
    test_features = transform_text(test_messages, vocabulary)

    # Fit on the training matrix, then predict labels for the test matrix.
    classifier = GaussianNB()
    classifier.fit(train_features, train_labels)
    predictions = classifier.predict(test_features)

    np.savetxt('vscikit-learn_spam_naive_bayes_predictions', predictions)

    # Mean of the element-wise equality gives the share of correct predictions.
    accuracy = np.mean(predictions == test_labels)
    print('Naive Bayes had an accuracy of {} on the testing set'.format(accuracy))
# Run the spam-classification experiment only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()