In [2]:
# Loading the iris data

from sklearn.datasets import load_iris
iris = load_iris()

In [5]:
# Storing the data and the response varaible in x and y respectively

X = iris.data
y = iris.target

In [6]:
# check the shapes of x and y

print(X.shape)
print(y.shape)

(150, 4)
(150,)


In [7]:
# conversting data to dataframe and viewing the first 5 rows

import pandas as pd

pd.DataFrame(X,columns = iris.feature_names).head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [8]:
# Examining the response vectors

print(y)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


In [9]:
# import the class

from sklearn.neighbors import KNeighborsClassifier


# instantiate the model 

knn = KNeighborsClassifier()

# Fit the model with data

knn.fit(X,y)



KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [10]:
# predict the response for new obs.

knn.predict([[3,5,4,2]])

array([1])

In [11]:
## REPRESENTING TEXT AS NUMERICAL DATA

simple_train = ['call you tonight', 'Call me a cab', 'please call me... PLEASE!']

In [12]:
# import and instantiate CountVectorizer 

from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()

In [13]:
# Learn the 'vocabulary' of train data

vect.fit(simple_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [14]:
# Examine the fitted vocabulary

vect.get_feature_names()

['cab', 'call', 'me', 'please', 'tonight', 'you']

In [16]:
# transform the data into a document term matrix

simple_train_dtm = vect.transform(simple_train)
simple_train_dtm

<3x6 sparse matrix of type '<class 'numpy.int64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [17]:
# convert sparse matrix to dense matrix

simple_train_dtm.toarray()

array([[0, 1, 0, 0, 1, 1],
       [1, 1, 1, 0, 0, 0],
       [0, 1, 1, 2, 0, 0]], dtype=int64)

In [18]:
# convert the array to a dataframe

pd.DataFrame(simple_train_dtm.toarray(),columns= vect.get_feature_names())

Unnamed: 0,cab,call,me,please,tonight,you
0,0,1,0,0,1,1
1,1,1,1,0,0,0
2,0,1,1,2,0,0


In [19]:
# Checking the type of dtm

type(simple_train_dtm)

scipy.sparse.csr.csr_matrix

In [20]:
print(simple_train_dtm)

  (0, 1)	1
  (0, 4)	1
  (0, 5)	1
  (1, 0)	1
  (1, 1)	1
  (1, 2)	1
  (2, 1)	1
  (2, 2)	1
  (2, 3)	2


In [21]:
## Text for model testing

simple_test = ["please don't call me"]

In [23]:
## Converting the data to a dtm

simple_test_dtm = vect.transform(simple_test)
simple_test_dtm.toarray()

array([[0, 1, 1, 1, 0, 0]], dtype=int64)

In [24]:
# Converting the data to a DataFrame

pd.DataFrame(simple_test_dtm.toarray(),columns = vect.get_feature_names())

Unnamed: 0,cab,call,me,please,tonight,you
0,0,1,1,1,0,0


In [26]:
### Reading text data into pandas dataframe

path = 'C:/Users/kulkarni/Documents/NLP_PyCon2016/sms.tsv'
sms = pd.read_table(path,header = None,names = ['label','message'])

In [28]:
# Shape of data

sms.shape

(5572, 2)

In [31]:
# head of sms

sms.head(10)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [30]:
# Examining the class distribution

sms.label.value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [32]:
# Converting label to a numerical variable:

sms['label_num'] = sms.label.map({'ham':0,'spam':1})

In [33]:
## Checking the conversion

sms.head(10)

Unnamed: 0,label,message,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
5,spam,FreeMsg Hey there darling it's been 3 week's n...,1
6,ham,Even my brother is not like to speak with me. ...,0
7,ham,As per your request 'Melle Melle (Oru Minnamin...,0
8,spam,WINNER!! As a valued network customer you have...,1
9,spam,Had your mobile 11 months or more? U R entitle...,1


In [34]:
# Defining X and y to use with the model

X = sms.message
y = sms.label_num
print(X.shape)
print(y.shape)

(5572,)
(5572,)


In [35]:
## Splitting data into train and test data

from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state = 1)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(4179,)
(1393,)
(4179,)
(1393,)


In [36]:
# Vectorizing our data

#Instantiate the vectorizer

vect = CountVectorizer()


In [37]:
# Learn training data vocabulary, then use it

vect.fit(X_train)

X_train_dtm = vect.transform(X_train)

In [38]:
# Examining dtm

X_train_dtm

<4179x7456 sparse matrix of type '<class 'numpy.int64'>'
	with 55209 stored elements in Compressed Sparse Row format>

In [39]:
# Transforming the test data

X_test_dtm = vect.transform(X_test)
X_test_dtm

<1393x7456 sparse matrix of type '<class 'numpy.int64'>'
	with 17604 stored elements in Compressed Sparse Row format>

In [40]:
## Building and Evaluating a model using multinomial Naive Bayes

from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()


In [41]:
# training the model using X_train and y_train

%time nb.fit(X_train_dtm,y_train)

Wall time: 98 ms


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [42]:
# Making the predictions on the test data

y_pred_class = nb.predict(X_test_dtm)

In [43]:
# Calculating the accuracy of the model

from sklearn import metrics

metrics.accuracy_score(y_test,y_pred_class)

0.98851399856424982

In [44]:
# Confusion Matrix

metrics.confusion_matrix(y_test,y_pred_class)

array([[1203,    5],
       [  11,  174]])

In [46]:
# False Positive

X_test[(y_pred_class == 1) & (y_test == 0)]

574               Waiting for your call.
3375             Also andros ice etc etc
45      No calls..messages..missed calls
3415             No pic. Please re-send.
1988    No calls..messages..missed calls
Name: message, dtype: object

In [None]:
# False Negative


In [47]:
# calculate predicted probabilities for X_test_dtm (poorly calibrated)
y_pred_prob = nb.predict_proba(X_test_dtm)[:, 1]
y_pred_prob

array([  2.87744864e-03,   1.83488846e-05,   2.07301295e-03, ...,
         1.09026171e-06,   1.00000000e+00,   3.98279868e-09])

In [48]:


# calculate AUC
metrics.roc_auc_score(y_test, y_pred_prob)



0.98664310005369604

In [49]:
## Comparing models

## LOGISTIC REGRESSION

from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()

In [51]:
# Train the model using log reg

%time logreg.fit(X_train_dtm,y_train)

Wall time: 566 ms


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [52]:
# Make the class predictions for test data

y_pred_class = logreg.predict(X_test_dtm)

In [53]:
# Calculate the probabilities

y_pred_prob = logreg.predict_proba(X_test_dtm)[:, 1]
y_pred_prob

array([ 0.01269556,  0.00347183,  0.00616517, ...,  0.03354907,
        0.99725053,  0.00157706])

In [54]:
# Calculate the accuracy

metrics.accuracy_score(y_test, y_pred_class)

0.9877961234745154

In [55]:
# Calculate AUC

metrics.roc_auc_score(y_test,y_pred_class)

0.95634284947198855