In [1]:
import spacy

In [4]:
# Load the English model package 'en_core_web_sm'; `nlp` is the callable used
# below to turn raw text into an annotated Doc.
nlp = spacy.load('en_core_web_sm')

In [5]:
# Echo the loaded object to confirm the model was found and loaded.
nlp


<spacy.lang.en.English at 0x22d68d28cf8>

In [6]:
# Run the pipeline on a sample sentence; the result is iterated token by token below.
# NOTE(review): the u'' prefix is a no-op on Python 3 and could be dropped.
doc = nlp(u'Tesla is looking at buying U.S. startup for $6 million')

In [7]:
# A Doc displays as the original text it was built from.
doc

Tesla is looking at buying U.S. startup for $6 million

In [11]:
# One line per token: surface text, coarse part-of-speech tag, dependency label.
for token in doc:
    print(token.text, token.pos_, token.dep_)

Tesla PROPN nsubj
is VERB aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.S. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM quantmod
6 NUM compound
million NUM pobj


In [12]:
# List the (name, component) pairs that make up the loaded pipeline.
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x22d68dab160>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x22d6a386a68>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x22d6a386ac8>)]

In [16]:
# Second sample sentence, containing a contraction ("isn't") to show how it is tokenized.
doc2 = nlp(u"mundu isn't a good boy.")

In [22]:
# Same per-token report as above; the contraction comes out as two tokens ("is" + "n't").
for token in doc2:
    print(token.text, token.pos_, token.dep_)

mundu ADJ nsubj
is VERB ROOT
n't ADV neg
a DET det
good ADJ amod
boy NOUN attr
. PUNCT punct


In [23]:
# Check whether the first token is flagged as the start of a sentence.
doc2[0].is_sent_start

True

##

In [26]:
# Text wrapped in literal double quotes, with an escaped apostrophe, used to
# see how the tokenizer treats punctuation and abbreviations.
mystring = '"we\'re moving to L.A. ! "'

In [27]:
# Print the string to see what the escape sequence evaluates to.
print(mystring)

"we're moving to L.A. ! "


In [28]:
# NOTE: rebinds `doc`, replacing the Tesla example parsed earlier.
doc = nlp(mystring)

In [29]:
# Printing a token shows its text. Per the output, the quotes, "'re" and "!"
# become separate tokens while "L.A." is kept whole.
for token in doc:
    print(token)

"
we
're
moving
to
L.A.
!
"


In [30]:
from spacy import displacy

In [31]:
# Sentence to visualize with displaCy; rebinds `doc` once more.
doc = nlp(u'Apple is going to build a U.K. factory for $6 million.')

In [34]:
# Draw the dependency parse inline in the notebook; the `distance` option is
# passed through to displaCy's layout settings.
displacy.render(
    doc,
    style='dep',
    jupyter=True,
    options={'distance': 70},
)

In [4]:
from sklearn.linear_model import LinearRegression

In [5]:
# Instantiate a linear regression estimator to show how scikit-learn models
# are configured; it is never fit in the visible cells.
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2, so this line raises TypeError on newer versions — confirm the
# pinned version before re-running.
model = LinearRegression(normalize=True)

In [6]:
# The estimator's repr lists its hyperparameters.
model

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)

In [7]:
# print() shows the same repr as the bare expression in the previous cell.
print(model)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)


In [8]:
import numpy as np

In [9]:
import pandas as pd

In [10]:
# Load the tab-separated SMS spam dataset; the path is relative to the
# notebook's working directory.
df = pd.read_csv('smsspamcollection.tsv', sep='\t')

In [11]:
# Preview the first rows: label, message text, and the numeric `length` and `punct` columns.
df.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [13]:
# Count missing values per column; all zeros means nothing needs imputing or dropping.
df.isnull().sum()

label      0
message    0
length     0
punct      0
dtype: int64

In [14]:
# Total number of rows (messages) in the dataset.
len(df)

5572

In [19]:
# Class balance. Per the output, ham is 4825 of 5572 rows (~87%), so accuracy
# alone is a weak metric: always predicting "ham" already scores about 0.87.
df['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
# Baseline features: only the two numeric columns, no message text yet.
X = df[['length', 'punct']]

In [22]:
# Target: the ham/spam label column.
y = df['label']

In [23]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [25]:
# Sanity check on the split: about 70% of the 5572 rows, with 2 feature columns.
X_train.shape

(3900, 2)

In [30]:
from sklearn.linear_model import LogisticRegression

In [32]:
# Logistic regression baseline with the solver chosen explicitly.
lr_model = LogisticRegression(solver='lbfgs')

#good

In [33]:
# Train on the training split only; the fitted estimator is echoed as the cell output.
lr_model.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [34]:
from sklearn import metrics

In [36]:
# Predicted labels for the held-out test rows.
predictions = lr_model.predict(X_test)

In [37]:
# Rows are true labels and columns are predicted labels, ordered ham then spam.
# The row sums (1448, 224) match the per-class support in the report below.
metrics.confusion_matrix(y_test, predictions)

array([[1404,   44],
       [ 219,    5]], dtype=int64)

In [38]:
# Per-class precision, recall and F1; on imbalanced data the spam row matters
# more than overall accuracy.
print(metrics.classification_report(y_test, predictions))

              precision    recall  f1-score   support

         ham       0.87      0.97      0.91      1448
        spam       0.10      0.02      0.04       224

   micro avg       0.84      0.84      0.84      1672
   macro avg       0.48      0.50      0.48      1672
weighted avg       0.76      0.84      0.80      1672



In [39]:
# Overall accuracy. For context, 1448 of the 1672 test rows are ham, so always
# predicting "ham" would score about 0.866.
print(metrics.accuracy_score(y_test, predictions))

0.8427033492822966


In [46]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes on the same two numeric features, for comparison
# with the logistic regression baseline. fit() returns the estimator itself.
nb_model = MultinomialNB().fit(X_train, y_train)
prediction_nb = nb_model.predict(X_test)

# Rows are true labels, columns are predicted labels (ham, spam).
print(metrics.confusion_matrix(y_test, prediction_nb))

[[1438   10]
 [ 224    0]]


In [47]:
# Overall accuracy of the naive Bayes model on the test split.
print(metrics.accuracy_score(y_test, prediction_nb))

0.8600478468899522


In [48]:
from sklearn.svm import SVC

In [49]:
# Support vector classifier on the two numeric features; gamma is set
# explicitly instead of relying on the library default.
svc_model = SVC(gamma='auto')
svc_model.fit(X_train, y_train)
predictions_svc = svc_model.predict(X_test)

# Rows are true labels, columns are predicted labels (ham, spam).
print(metrics.confusion_matrix(y_test, predictions_svc))

[[1373   75]
 [ 121  103]]


In [50]:
# Overall accuracy of the SVC model on the test split.
print(metrics.accuracy_score(y_test, predictions_svc))

0.8827751196172249


In [51]:
# Feature extraction from text
# Term frequency (TF) and inverse document frequency (IDF)
# Document-term matrix: one row per document, one column per vocabulary word
# IDF is the logarithmically scaled inverse fraction of the documents that contain the word:
# divide the total number of documents by the number of documents containing the word, then take the log

In [52]:
from sklearn.feature_extraction.text import CountVectorizer

In [53]:
# Bag-of-words vectorizer with default settings; it is fitted on the training text further down.
vect = CountVectorizer()

In [58]:
# TF-IDF vectorizer
# Term frequency (TF) alone
# tends to over-emphasize documents that happen to use common words such as "the" more frequently
# TF-IDF = term frequency weighted by the inverse document frequency factor

In [59]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [60]:
#begin

In [62]:
import numpy as np
import pandas as pd

In [63]:
# Switch the features from the two numeric columns to the raw message text.
# NOTE: this rebinds `X`, and the split below rebinds X_train/X_test, so the
# earlier numeric-feature cells only work if run before this point.
X = df['message']

In [64]:
# Target is unchanged: the ham/spam label column.
y = df['label']

In [65]:
# Re-split on the text features. The test fraction is 0.33 here versus 0.3 for
# the numeric baseline, so the two test sets are not the same rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [67]:
# Fit the vectorizer to the training text (build the vocabulary, count the
# words) and transform the messages into a document-term count matrix.
# fit_transform() is equivalent to fit() followed by transform() on the same
# data, but is implemented more efficiently in scikit-learn. Only the training
# split is used, so no test vocabulary leaks into the features.
X_train_counts = vect.fit_transform(X_train)

In [70]:
# (documents, vocabulary size): per the output, 3733 training messages by 7082 terms.
X_train_counts.shape

(3733, 7082)

In [72]:
from sklearn.feature_extraction.text import TfidfTransformer

# Transformer that re-weights a count matrix into TF-IDF values.
tfidf_transformer = TfidfTransformer()

In [74]:
# Fit the TF-IDF weights on the training counts and apply them in one call.
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [75]:
# Same shape as the count matrix: the values are re-weighted, the dimensions are unchanged.
X_train_tfidf.shape


(3733, 7082)

In [79]:
from sklearn.svm import LinearSVC

# Linear SVM trained directly on the TF-IDF matrix.
# NOTE: to predict with `clf`, new text must first go through `vect.transform`
# and `tfidf_transformer.transform`; the Pipeline cell further down bundles
# those steps.
clf = LinearSVC()
clf.fit(X_train_tfidf, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [93]:
# Use a Pipeline to chain vectorization and classification into a single estimator

In [80]:
from sklearn.pipeline import Pipeline

# Chain TF-IDF vectorization and the linear SVM so raw text can be passed
# straight to fit() and predict().
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC()),
])

In [81]:
# Fit both pipeline steps on the raw training messages.
text_clf.fit(X_train,y_train)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [82]:
# Predict on the raw test messages.
# NOTE: rebinds `predictions`, which previously held the logistic regression output.
predictions = text_clf.predict(X_test)

In [83]:
# Rows are true labels, columns are predicted labels (ham, spam).
print(metrics.confusion_matrix(y_test, predictions))

[[1586    7]
 [  12  234]]


In [92]:
# Accuracy of the TF-IDF + LinearSVC pipeline on the test set

In [91]:
# Overall accuracy of the text pipeline on the test split.
print(metrics.accuracy_score(y_test, predictions))

0.989668297988037


In [90]:
# Predict on independent, previously unseen messages

In [88]:
# Classify one ad-hoc message; the pipeline is given a one-element list of raw text.
text_clf.predict(['I will visit you today morining'])

array(['ham'], dtype=object)

In [89]:
# A message written in typical spam phrasing, which the pipeline labels as spam (see output).
text_clf.predict(['congratulations! you have been selected winner. TEXT WON to 44255 congratulations free entry to contest'])

array(['spam'], dtype=object)