In [1]:
import pandas as pd
import numpy as np

# Read the SMS spam corpus, keeping only the first two CSV columns and
# renaming them to `spam` (the label) and `text` (the message).
df = pd.read_csv(
    'data/spam.csv',
    names=['spam', 'text'],
    usecols=[0, 1],
    header=0,
    encoding='ISO-8859-1',
)
# Binarize the label: 'ham' becomes 0 and every other value becomes 1.
df['spam'] = np.where(df['spam'].eq('ham'), 0, 1)
df.head()

Unnamed: 0,spam,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize.casual import casual_tokenize

# Turn every message into a TF-IDF vector, tokenizing with NLTK's casual
# tokenizer, and convert the result to a dense array.
tfidf = TfidfVectorizer(tokenizer=casual_tokenize)
tfidf_docs = tfidf.fit_transform(df['text']).toarray()

# Report the matrix shape, the number of spam messages, and the spam fraction.
print(tfidf_docs.shape)
print(df['spam'].sum())
print(df['spam'].sum() / len(tfidf_docs))

# Result: 5572 documents, a vocabulary of 9187 tokens, and 747
# messages labeled as spam -- roughly 13% of the corpus.

(5572, 9187)
747
0.13406317300789664


In [5]:
# We have more than 5500 SMS messages in our dataset and only
# 13% of them are labeled as spam. So we have an unbalanced
# training set with about 6.5:1 "ham" (normal SMS messages)
# to "spam" (unwanted solicitations and advertisements).
# However, the vocabulary size is even more problematic.
# We have more unique words in our vocabulary than we have
# SMS messages. That’s a recipe for over-fitting. So some
# dimension reduction/consolidation is definitely in order.
# That’s exactly what LSA is for; below we carry it out with
# PCA on the TF-IDF vectors.

from sklearn.decomposition import PCA

# Project the TF-IDF vectors onto 16 principal components ("topics").
# For an input of this shape (thousands of rows and columns, few components)
# scikit-learn's default svd_solver='auto' policy picks the randomized SVD
# solver, so the seed is fixed to make the topic vectors reproducible from
# one run to the next.
pca = PCA(n_components=16, random_state=271828)
pca = pca.fit(tfidf_docs)
pca_topic_vectors = pca.transform(tfidf_docs)

# Label one column per component actually produced instead of repeating
# the literal 16, so the frame stays consistent if n_components changes.
pca_topic_df = pd.DataFrame(
    pca_topic_vectors,
    columns=['topic{}'.format(i) for i in range(pca_topic_vectors.shape[1])])
pca_topic_df.head()

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9,topic10,topic11,topic12,topic13,topic14,topic15
0,0.201982,0.033972,0.005732,-0.000226,-0.014876,-0.052811,0.038975,-0.042265,-0.034425,0.036801,-0.071311,-0.053138,-0.000186,-0.0383,-0.000151,-0.054085
1,0.402178,-0.032359,-0.08086,0.07745,0.112747,0.061827,0.031459,0.056353,0.044572,0.020522,-0.010761,-0.00201,0.018991,-0.02492,0.056529,0.047418
2,-0.030515,0.060799,-0.053171,-0.099954,0.07895,-0.035817,0.003132,-0.035955,0.012645,-0.072781,-0.04989,0.105087,0.030264,-0.030442,-0.020456,-0.05393
3,0.32843,-0.029826,-0.030484,0.00916,0.055054,0.066053,-0.164622,-0.008178,-0.061865,0.089285,-0.0777,-0.023615,0.02306,-0.080385,0.023448,0.035705
4,0.003097,0.032948,0.032445,0.02045,-0.063115,-0.103995,-0.039177,0.015188,0.060371,-0.052217,-0.006654,0.023747,0.029637,-0.016783,-0.082498,-0.009058


In [8]:
# Use Linear Discriminant Analysis (LDA) to classify, based on the PCA topic
# vectors, whether a message is spammy or not.

# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed
# in 0.20; `train_test_split` now lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Hold out half of the messages for evaluation; the fixed seed keeps the
# split reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    pca_topic_vectors, df.spam, test_size=0.5, random_state=271828)
lda = LDA(n_components=1)
lda = lda.fit(xtrain, ytrain)
# Store a prediction for every message (training and held-out halves alike)
# next to the true labels in `df`.
df['pca16_spam'] = lda.predict(pca_topic_vectors)
# Accuracy on the held-out half only, rounded for display.
round(float(lda.score(xtest, ytest)), 3)

0.955