In [1]:
import pandas as pd
import numpy as np

# Load the labelled messages; the file is expected to provide the 'text' and
# 'target' columns used throughout the notebook.
spam_data = pd.read_csv('spam.csv')

# Binary-encode the label: 1 where the original value is 'spam', 0 otherwise.
is_spam = spam_data['target'].eq('spam')
spam_data['target'] = np.where(is_spam, 1, 0)

# Not rendered: only the last expression of a cell is displayed.
spam_data.head(10)
# Number of rows loaded.
len(spam_data)

5572

In [2]:
from sklearn.model_selection import train_test_split


# Hold out part of the data for evaluation. No test_size is given, so the
# library default proportion applies; random_state=0 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    spam_data['text'],
    spam_data['target'],
    random_state=0,
)

In [4]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts: the vocabulary is learned from the training messages only.
vectorizer = CountVectorizer().fit(X_train)
feature_vector = vectorizer.transform(X_train)

# Multinomial Naive Bayes with smoothing parameter alpha=0.1.
classifierNB = MultinomialNB(alpha=0.1).fit(feature_vector, y_train)

# Encode the held-out messages with the same vocabulary before predicting.
X_test_counts = vectorizer.transform(X_test)
predictions = classifierNB.predict(X_test_counts)

In [7]:
from sklearn.metrics import average_precision_score
from sklearn.metrics import confusion_matrix

# Compare the Naive Bayes predictions with the held-out labels.
confusion_matrix1 = confusion_matrix(y_true=y_test, y_pred=predictions)


In [8]:
# Display the matrix computed from (y_test, predictions): per scikit-learn's
# convention, rows are the true classes and columns the predicted classes.
confusion_matrix1

array([[1196,    0],
       [  11,  186]], dtype=int64)

In [9]:
# Spot-check the Naive Bayes model on a single hand-written message.
sample_features = vectorizer.transform(['won a award'])
classifierNB.predict(sample_features)

array([1])

In [11]:
def add_feature(X, feature_to_add):
    """
    Append one or more feature columns to a feature matrix.

    X is the existing feature matrix. feature_to_add is either a single
    feature (one value per row of X) or a list of such features; each
    feature becomes one new column on the right-hand side of X.

    Returns the combined matrix in CSR sparse format.
    """
    from scipy.sparse import csr_matrix, hstack

    # csr_matrix lays each feature out as a row, so transpose to get columns.
    extra_columns = csr_matrix(feature_to_add).T
    return hstack([X, extra_columns], format='csr')

In [13]:
from sklearn.linear_model import LogisticRegression

def _report_features(label, matrix):
    """Print the shape of `matrix` plus the size and last term of vect's vocabulary."""
    # Read the shape straight from the sparse matrix; converting to a dense
    # array just to inspect its dimensions allocates rows x columns cells.
    print('{} - {}'.format(label, matrix.shape))
    feature_names = vect.get_feature_names()
    print('length of features - {}'.format(len(feature_names)))
    print('last feature - {}'.format(feature_names[-1]))


# Character n-grams (2 to 5 characters, built within word boundaries) that
# occur in at least 5 training documents.
vect = CountVectorizer(min_df=5, ngram_range=(2,5), analyzer='char_wb').fit(X_train)
X_train_vectorized = vect.transform(X_train)
_report_features('train_vec_1', X_train_vectorized)

# Extra feature 1: message length in characters.
X_train_vectorized = add_feature(X_train_vectorized, X_train.str.len())
_report_features('train_vec_add_len', X_train_vectorized)

# Extra feature 2: number of digit characters in the message.
X_train_digits = X_train.str.findall(r'(\d)')
X_train_vectorized = add_feature(X_train_vectorized, list(map(len, X_train_digits)))
_report_features('train_vec_add_dig', X_train_vectorized)

# Extra feature 3: number of non-word characters (anything matching \W).
X_train_nonChar = X_train.str.findall(r'(\W)')
X_train_vectorized = add_feature(X_train_vectorized, list(map(len, X_train_nonChar)))
_report_features('train_vec_add_nonchar', X_train_vectorized)

# Build the test matrix with the same vocabulary and the same three extra
# columns, appended in the same order as for the training matrix.
X_test_vectorized = vect.transform(X_test)

X_test_vectorized = add_feature(X_test_vectorized, X_test.str.len())
X_test_digits = X_test.str.findall(r'(\d)')
X_test_vectorized = add_feature(X_test_vectorized, list(map(len, X_test_digits)))
X_test_nonChar = X_test.str.findall(r'(\W)')
X_test_vectorized = add_feature(X_test_vectorized, list(map(len, X_test_nonChar)))

# C is the inverse regularisation strength, so C=100 regularises only lightly.
model = LogisticRegression(C=100)
model.fit(X_train_vectorized, y_train)


train_vec_1 - (4179, 16314)
length of features - 16314
last feature - û÷t 
train_vec_add_len - (4179, 16315)
length of features - 16314
last feature - û÷t 
train_vec_add_dig - (4179, 16316)
length of features - 16314
last feature - û÷t 
train_vec_add_nonchar - (4179, 16317)
length of features - 16314
last feature - û÷t 


LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [15]:
import pickle

# Persist the fitted logistic-regression model. Only `model` is written: the
# fitted `vect` and the three hand-built columns are not part of the pickle,
# so a consumer must rebuild identical features before calling predict.
# NOTE(review): the destination is an absolute, user-specific path; consider
# moving it to a configurable location.
# The context manager closes the file even if pickle.dump raises.
with open("C:/Users/suryanarayana.dunna/classifier_log_regression.pickle", "wb") as pickle_out:
    pickle.dump(model, pickle_out)