In [1]:
    #Load Libraries
    
    import numpy as np
    import pandas as pd
    import seaborn as sns
    import matplotlib.pyplot as plt
    import os
    from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
#Load Dataset
dataset = pd.read_csv("C:/Users/saiva/OneDrive/Documents/spam_assassin.csv")
dataset.head()

Unnamed: 0,text,target
0,From ilug-admin@linux.ie Mon Jul 29 11:28:02 2...,0
1,From gort44@excite.com Mon Jun 24 17:54:21 200...,1
2,From fork-admin@xent.com Mon Jul 29 11:39:57 2...,1
3,From dcm123@btamail.net.cn Mon Jun 24 17:49:23...,1
4,From ilug-admin@linux.ie Mon Aug 19 11:02:47 2...,0


In [3]:
#Split data into text and target
data, target = dataset.text, dataset.target
data.shape

(5796,)

In [4]:
data.head()

0    From ilug-admin@linux.ie Mon Jul 29 11:28:02 2...
1    From gort44@excite.com Mon Jun 24 17:54:21 200...
2    From fork-admin@xent.com Mon Jul 29 11:39:57 2...
3    From dcm123@btamail.net.cn Mon Jun 24 17:49:23...
4    From ilug-admin@linux.ie Mon Aug 19 11:02:47 2...
Name: text, dtype: object

In [5]:
target.shape

(5796,)

In [6]:
#Creating train and test data
from sklearn.model_selection import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=0)

for train_index, test_index in sss.split(data, target):
    train_X, test_X = data.loc[train_index], data.loc[test_index]
    train_y, test_y = target.loc[train_index], target.loc[test_index]
    

In [7]:
print(train_X, train_y)

1264    Return-Path: tim.one@comcast.net Delivery-Date...
2540    From rssfeeds@jmason.org Thu Sep 26 16:41:31 2...
3116    From rssfeeds@jmason.org Tue Sep 24 10:47:47 2...
4812    From fork-admin@xent.com Wed Aug 28 18:17:25 2...
4213    From ilug-admin@linux.ie Mon Aug 12 11:07:30 2...
                              ...                        
1882    From Market_Research@spcu.spb.su Mon Jun 24 17...
1674    From ilug-admin@linux.ie Tue Aug 13 10:28:08 2...
317     From rssfeeds@jmason.org Thu Sep 26 16:43:17 2...
4874    From exmh-workers-admin@redhat.com Mon Jul 29 ...
1536    From ormlh@imail.ru Sun Jul 15 04:56:31 2001 R...
Name: text, Length: 4347, dtype: object 1264    0
2540    0
3116    0
4812    0
4213    0
       ..
1882    1
1674    0
317     0
4874    0
1536    1
Name: target, Length: 4347, dtype: int64


In [8]:
from sklearn.model_selection import cross_val_score, cross_val_predict

In [9]:
from sklearn.tree import DecisionTreeClassifier
vect = TfidfVectorizer(min_df=5, ngram_range=(1,3)).fit(train_X)
X_train_vectorized = vect.transform(train_X)


In [10]:
def add_feature(X, feature_to_add):
    
    from scipy.sparse import csr_matrix, hstack
    return hstack([X, csr_matrix(feature_to_add).T], 'csr')
 
 

add_length=train_X.str.len()
add_digits=train_X.str.count(r'\d')
add_dollars=train_X.str.count(r'\$')
add_characters=train_X.str.count(r'\W')
 
X_train_transformed = add_feature(X_train_vectorized , [add_length, add_digits,  add_dollars, add_characters])
 

add_length_t=test_X.str.len()
add_digits_t=test_X.str.count(r'\d')
add_dollars_t=test_X.str.count(r'\$')
add_characters_t=test_X.str.count(r'\W')
 
 
X_test_transformed = add_feature(vect.transform(test_X), [add_length_t, add_digits_t,  add_dollars_t, add_characters_t])

In [11]:
print(X_test_transformed)

  (0, 870)	0.028472240414707813
  (0, 1013)	0.09743201706859243
  (0, 1055)	0.031172424101657926
  (0, 1096)	0.01714216969640707
  (0, 1100)	0.017154239554711882
  (0, 1131)	0.01887263090373362
  (0, 1133)	0.01908788965536923
  (0, 1810)	0.014059081335246677
  (0, 1933)	0.037340650355548635
  (0, 1934)	0.037340650355548635
  (0, 1991)	0.08545477138358928
  (0, 2006)	0.0281356574930967
  (0, 2126)	0.0654847153264489
  (0, 2127)	0.0654847153264489
  (0, 2380)	0.05334692117524734
  (0, 2839)	0.010100647296315475
  (0, 2999)	0.034962495734484726
  (0, 3497)	0.03730311774591902
  (0, 3540)	0.00781475353829268
  (0, 3545)	0.007836467556691241
  (0, 3610)	0.08356719633956208
  (0, 3614)	0.03656072050244462
  (0, 3676)	0.00783284391783741
  (0, 3677)	0.008289237729013956
  (0, 4013)	0.01994644302695112
  :	:
  (1448, 106046)	0.011451375479532027
  (1448, 106101)	0.003471805956621508
  (1448, 106102)	0.003471805956621508
  (1448, 106330)	0.007767956462480521
  (1448, 106580)	0.01472444642095851

In [12]:
from sklearn.ensemble import RandomForestClassifier


In [13]:
forest_clf = RandomForestClassifier(random_state = 42)
y_probas_forest = cross_val_predict(forest_clf , X_train_transformed , train_y , cv = 3 , method = 'predict_proba')

In [15]:
from sklearn.model_selection import RandomizedSearchCV

param_grid = [
    {'n_estimators' : [3, 10, 30], 'max_features' : [2, 4, 6, 8]},
    {'bootstrap' : [False], 'n_estimators' : [3, 10], 'max_features' : [2, 3, 4]},
    
]

forest_reg = RandomForestClassifier()

random_search = RandomizedSearchCV(forest_reg, param_grid, cv = 5,
                          scoring = 'neg_mean_squared_error',
                          return_train_score = True)

random_search.fit(X_train_transformed , train_y)

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(),
                   param_distributions=[{'max_features': [2, 4, 6, 8],
                                         'n_estimators': [3, 10, 30]},
                                        {'bootstrap': [False],
                                         'max_features': [2, 3, 4],
                                         'n_estimators': [3, 10]}],
                   return_train_score=True, scoring='neg_mean_squared_error')

In [16]:
random_search.best_params_


{'n_estimators': 30, 'max_features': 8}

In [17]:
random_search.best_estimator_


RandomForestClassifier(max_features=8, n_estimators=30)

In [19]:
final_model = random_search.best_estimator_
final_model.fit(X_train_transformed, train_y)


RandomForestClassifier(max_features=8, n_estimators=30)

In [21]:
predictions = final_model.predict(X_test_transformed)


In [22]:

from sklearn.metrics import accuracy_score
accuracy_score(predictions, test_y)


0.9868875086266391