In [1]:
import pandas as pd
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
url = 'C:\\Users\\UttamSinha\\spam.csv'
messages = pd.read_csv(url,encoding='latin-1')
messages = messages.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
messages= messages.rename(columns={"v1":"label", "v2":"message"})
messages.head(5)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
from pandas.plotting import scatter_matrix
print(messages.describe())

       label                 message
count   5572                    5572
unique     2                    5169
top      ham  Sorry, I'll call later
freq    4825                      30


In [4]:
import string
from nltk.corpus import stopwords

In [5]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [6]:
stopwords.words('english')
#These words and punctuation don’t really say anything about people’ emotions,
#so we need to get rid of them if they are in the tweets.

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [7]:
def text_processing(mess):
    nopunc = [char for char in mess if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    return [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]


#Here we have a function named text_processing, and we are processing lines/tweets. 
#You can name them anything. Ex: bake(pizza). A mess/pizza is a input, and we process/bake it. 
#At this point, this is just a raw function. 
#Think of it as y = ax + b. Now we do something with it.

#nopunc = [char for char in mess if char not in string.punctuation]
#this is a for loop in Python, and it basically says: for character in line, 
#if character is not in the punctuation list, take it.
#So if I have a line: “Hi! My name is Hung “William” Mai, and I am from Vietnam.” 
#This “sub-function” will take everything that is not a punctation and put them in a list.
#So you should have nopunc = [Hi, My, name, is, Hung, William, Mai, and, I, am, from, Vietnam]. 
#All the punctuations were deleted, and you are left with a list of words

#nopunc = ''.join(nopunc)
#return [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]
#Now that we have a list with all the elements separated by a comma, 
#we would want to put them all again to be processed/joined again 
#so that it can be processed with another sub-function to get rid of all the unimportant words.
#From “Hi My name is Hung William Mai and I am from Vietnam”, we would get 
#“Hi name Hung William Mai Vietnam”, then a list that has [Hi, name, Hung, William, Mai, Vietnam].
#In general, we only take what we need.

In [8]:
messages['message'].head().apply(text_processing)

0    [Go, jurong, point, crazy, Available, bugis, n...
1                       [Ok, lar, Joking, wif, u, oni]
2    [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3        [U, dun, say, early, hor, U, c, already, say]
4    [Nah, dont, think, goes, usf, lives, around, t...
Name: message, dtype: object

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
bow_transformer = CountVectorizer(analyzer=text_processing).fit(messages['message'])

#Now we'll convert each tweets, represented as a list of tokens (lemmas) above, 
#into a vector that machine learning models can understand.
#Doing that requires essentially three steps, in the bag-of-words model:


#counting how many times does a word occur in each message (term frequency)
#weighting the counts, so that frequent tokens get lower weight (inverse document frequency)
#normalizing the vectors to unit length, to abstract from the original text length (L2 norm)
#Each vector has as many dimensions as there are unique words in the SMS corpus:

In [11]:
print(len(bow_transformer.vocabulary_))

11304


In [12]:
mess4 = messages['message'][4]
print (mess4)

#Here I have used scikit-learn (sklearn), a powerful Python library for teaching machine learning. 
#It contains a multitude of various methods and options.
#Let's take one text message and get its bag-of-words counts as a vector,
#putting to use our new bow_transformer:

Nah I don't think he goes to usf, he lives around here though


In [13]:
bow4 = bow_transformer.transform([mess4])

In [14]:
print(bow4.shape)

(1, 11304)


In [15]:
print(bow4)

#So, nine unique words in message nr. 4, two of them appear twice, the rest only once. 
#Sanity check: what are these words the appear twice?

  (0, 2897)	1
  (0, 4698)	1
  (0, 6038)	1
  (0, 6786)	1
  (0, 7740)	1
  (0, 10320)	1
  (0, 10337)	1
  (0, 10681)	1


In [16]:
messages_bow =bow_transformer.transform(messages['message'])

In [17]:
messages_bow.shape

(5572, 11304)

In [18]:
messages_bow.nnz

50193

In [19]:
sparsity = 100.0*messages_bow.nnz/(messages_bow.shape[0]*messages_bow.shape[1])

In [20]:
sparsity

0.07968927896991783

In [21]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer().fit(messages_bow)
tfidf4 = tfidf_transformer.transform(bow4)
print(tfidf4)

#after the counting, the term weighting and normalization can be done with TF-IDF, 
#using scikit-learn's TfidfTransformer


#Occurrence count is a good start but there is an issue: 
#longer documents will have higher average count values than shorter documents, 
#even though they might talk about the same topics.
#To avoid these potential discrepancies it suffices to divide the number of occurrences 
#of each word in a document by the total number of words in the document: these new features 
#are called tf for Term Frequencies.
#Another refinement on top of tf is to downscale weights for words that occur 
#in many documents in the corpus and are therefore less informative than those 
#that occur only in a smaller portion of the corpus.
#This downscaling is called tf–idf for “Term Frequency times Inverse Document Frequency”.

  (0, 10681)	0.401736248346
  (0, 10337)	0.353924419133
  (0, 10320)	0.270324575091
  (0, 7740)	0.445560397623
  (0, 6786)	0.353924419133
  (0, 6038)	0.242504809384
  (0, 4698)	0.310294953567
  (0, 2897)	0.401736248346


In [22]:
#checking Inverse document freq. for a one word
tfidf_transformer.idf_[bow_transformer.vocabulary_['university']]
#What is the IDF (inverse document frequency) of the word "u"? Of word "university"?

8.527076498901426

In [23]:
messages_tfidf = tfidf_transformer.transform(messages_bow)

#To transform the entire bag-of-words corpus into TF-IDF corpus at once:
#we firstly use the fit(..) method to fit our estimator to the data 
#and secondly the transform(..) method to transform our count-matrix to a tf-idf representation. 
#These two steps can be combined to achieve the same end result faster by skipping redundant processing. 
#This is done through using the fit_transform(..) method

In [24]:
messages_tfidf.shape

(5572, 11304)

In [25]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(messages_tfidf, messages['label'])
spam_detect_model.predict(tfidf4)[0]



'ham'

In [26]:
all_pred = spam_detect_model.predict(messages_tfidf)
all_pred

array(['ham', 'ham', 'spam', ..., 'ham', 'ham', 'ham'],
      dtype='<U4')

In [27]:
from sklearn.cross_validation import train_test_split



In [28]:
msg_train,msg_test,label_train,label_test = train_test_split(messages['message'],
                                                            messages['label'], 
                                                            test_size = 0.3)

In [29]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [30]:
pipeline = Pipeline([('bow', CountVectorizer(analyzer=text_processing)),('tfidf', TfidfTransformer()),('classifier', MultinomialNB())])

In [31]:
pipeline.fit(msg_train, label_train)

Pipeline(memory=None,
     steps=[('bow', CountVectorizer(analyzer=<function text_processing at 0x000000A6553AA950>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preproce...f=False, use_idf=True)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [32]:
predictions = pipeline.predict(msg_test)

In [33]:
from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(label_test, predictions))

             precision    recall  f1-score   support

        ham       0.96      1.00      0.98      1431
       spam       1.00      0.72      0.84       241

avg / total       0.96      0.96      0.96      1672



In [34]:
print(confusion_matrix(label_test, predictions))

[[1431    0]
 [  67  174]]


In [35]:
#fitting a Random forest classifier
pipeline2 = Pipeline([
    ('bow', CountVectorizer(analyzer=text_processing)),
    ('tfidf', TfidfTransformer()),
    ('classifier', RandomForestClassifier())
])

In [36]:
pipeline2.fit(msg_train, label_train)

Pipeline(memory=None,
     steps=[('bow', CountVectorizer(analyzer=<function text_processing at 0x000000A6553AA950>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preproce...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [37]:
predictions2 = pipeline2.predict(msg_test)
print(classification_report(label_test, predictions2))

             precision    recall  f1-score   support

        ham       0.96      1.00      0.98      1431
       spam       0.99      0.78      0.88       241

avg / total       0.97      0.97      0.97      1672



In [38]:
import numpy as np
import pandas as pd # pandas is also used in the iloc function which is some kind of integer position based kind of thing
import matplotlib.pyplot as plt
import string
import nltk
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from subprocess import check_output

In [39]:
def pre_process(text):    
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = [word for word in text.split() if word.lower() not in stopwords.words('english')]
    words = ""
    for i in text:
            stemmer = SnowballStemmer("english")
            words += (stemmer.stem(i))+" "
    return words

In [40]:
textFeatures = messages['message'].copy()
textFeatures = textFeatures.apply(pre_process)
vectorizer = TfidfVectorizer("english")
features = vectorizer.fit_transform(textFeatures)
features_train, features_test, labels_train, labels_test = train_test_split(features, messages['label'], test_size=0.3, random_state=111)

In [41]:
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

svc = SVC(kernel='sigmoid', gamma=1.0)
svc.fit(features_train, labels_train)
prediction = svc.predict(features_test)
accuracy_score(labels_test,prediction)

0.97846889952153115

In [42]:
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

svc = SVC(kernel='sigmoid', gamma=2.0)
svc.fit(features_train, labels_train)
prediction = svc.predict(features_test)
accuracy_score(labels_test,prediction)

0.97248803827751196

In [43]:
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB(alpha=0.2)
mnb.fit(features_train, labels_train)
prediction = mnb.predict(features_test)
accuracy_score(labels_test,prediction)

0.98504784688995217

In [44]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.stem.porter import PorterStemmer
#from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [45]:
y=messages.iloc[:,0].values

In [46]:
orpus=[]
for i in range(0,5572):
    review = re.sub('[^a-zA-Z]',' ',messages['message'][i])
    review=review.lower()
    review=review.split()
    ps=PorterStemmer()
    review=[ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review=' '.join(review)
    orpus.append(review)

In [47]:
from sklearn.feature_extraction.text import CountVectorizer    
cv=CountVectorizer(max_features=3000)
x=cv.fit_transform(orpus).toarray()
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder() 
y=le.fit_transform(y) 

In [48]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.20)
from sklearn.naive_bayes import GaussianNB
classifier=GaussianNB()
classifier.fit(X_train,y_train)

GaussianNB(priors=None)

In [49]:
pred=classifier.predict(X_test)
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
print('Accuracy score: {}'.format(accuracy_score(y_test, pred)))
print('Precision score: {}'.format(precision_score(y_test, pred)))
print('Recall score: {}'.format(recall_score(y_test, pred)))
print('F1 score: {}'.format(f1_score(y_test, pred)))

Accuracy score: 0.8726457399103139
Precision score: 0.5283018867924528
Recall score: 0.89171974522293
F1 score: 0.6635071090047393


In [50]:
from sklearn.ensemble import RandomForestClassifier
classifier1=RandomForestClassifier(n_estimators=15,criterion='entropy')
classifier1.fit(X_train,y_train)
predRF=classifier1.predict(X_test)
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
print('Accuracy score: {}'.format(accuracy_score(y_test, predRF)))
print('Precision score: {}'.format(precision_score(y_test, predRF)))
print('Recall score: {}'.format(recall_score(y_test, predRF)))
print('F1 score: {}'.format(f1_score(y_test, predRF)))
#from subprocess import check_output
#print(check_output(["ls", "../input"]).decode("utf8"))

Accuracy score: 0.9766816143497757
Precision score: 1.0
Recall score: 0.8343949044585988
F1 score: 0.9097222222222222
