In [1]:
import csv
import numpy as np
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [2]:
# Load the whole CSV into a 2-D array of strings.
# The file is opened in a `with` block so the handle is closed as soon as the
# rows are read, and with newline='' as the csv module requires so that
# newlines embedded in quoted fields are parsed correctly.
with open('spam.csv', newline='') as csv_file:
    data = np.array(list(csv.reader(csv_file)))

# Column 0 holds the class label (compared against 'spam' further down);
# column 1 holds the message text that is fed to the vectorizer.
act_target = data[:, 0]
text = data[:, 1]


In [7]:
# Encode the labels numerically: +1 where the label is 'spam', -1 otherwise.
spam_t = np.where(act_target == 'spam', 1, -1)

# Learn the vocabulary from the messages and build the sparse
# document-term count matrix in one step.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(text)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(). Use whichever this installation provides so the
# cell runs on both old and new releases; list() keeps the printed form the
# same in either case.
if hasattr(count_vect, 'get_feature_names_out'):
    feature_names = list(count_vect.get_feature_names_out())
else:
    feature_names = count_vect.get_feature_names()
print(feature_names)
print(X_train_counts)

['2005', '21st', 'comp', 'cup', 'entry', 'fa', 'final', 'free', 'in', 'its', 'may', 'name', 'rofl', 'tkts', 'to', 'true', 'win', 'wkly']
  (0, 0)	1
  (0, 10)	1
  (0, 1)	1
  (0, 13)	1
  (0, 6)	1
  (0, 3)	1
  (0, 5)	1
  (0, 16)	1
  (0, 14)	1
  (0, 2)	1
  (0, 17)	1
  (0, 8)	1
  (0, 4)	1
  (0, 7)	1
  (1, 11)	1
  (1, 15)	1
  (1, 9)	2
  (1, 12)	1
  (1, 14)	1


In [4]:
# Re-weight the raw term counts with TF-IDF.
# Fit the transformer on the training counts first, then apply it to the same
# matrix; the fitted transformer is kept so new messages can be transformed
# with the same weights later.
tfid_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfid_transformer.transform(X_train_counts)
print(X_train_tfidf)

  (0, 7)	0.272102608304
  (0, 4)	0.272102608304
  (0, 8)	0.272102608304
  (0, 17)	0.272102608304
  (0, 2)	0.272102608304
  (0, 14)	0.193603246879
  (0, 16)	0.272102608304
  (0, 5)	0.272102608304
  (0, 3)	0.272102608304
  (0, 6)	0.272102608304
  (0, 13)	0.272102608304
  (0, 1)	0.272102608304
  (0, 10)	0.272102608304
  (0, 0)	0.272102608304
  (1, 14)	0.25969799324
  (1, 12)	0.364996468145
  (1, 9)	0.72999293629
  (1, 15)	0.364996468145
  (1, 11)	0.364996468145


In [9]:
# Train a support vector classifier with a linear kernel on the TF-IDF
# features, using the +1 / -1 spam labels as targets.
clf = svm.SVC(kernel='linear')
# Left as the cell's final expression so the fitted estimator is displayed.
clf.fit(X=X_train_tfidf, y=spam_t)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [17]:
# Classify new, previously unseen messages.
test_text = ['Fogg is going on', 'Free Nokia Camcorder delivery today', 'What else is going on']

# Reuse the already-fitted vectorizer and TF-IDF transformer (transform only,
# no re-fitting) so the new messages are projected onto the training features.
X_new_counts = count_vect.transform(test_text)
X_new_tfidf = tfid_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

# Pair each message with its prediction instead of looping over a hard-coded
# count, so the report stays correct when test_text grows or shrinks.
for message, label in zip(test_text, predicted):
    print('Text', message, '...Classification', "spam" if label == 1 else "Ham")

Text Fogg is going on ...Classification spam
Text Free Nokia Camcorder delivery today ...Classification spam
Text What else is going on ...Classification spam
