In [None]:
## load all required libraries
import os
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import nltk
import string
from nltk.corpus import stopwords
nltk.download("stopwords")
from nltk.stem.porter import PorterStemmer

stop_word=set(stopwords.words('english'))
import numpy as np
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
## open the raw dataset for reading
# The encoding is pinned so decoding does not depend on the platform default.
# NOTE(review): assumes the corpus file is UTF-8 encoded -- confirm for your copy.
file = open('SMSSpamCollection', 'r', encoding='utf-8')

In [None]:
## start from an empty DataFrame; its columns are attached in later cells
data= pd.DataFrame()

In [None]:
## parse every "<label>\t<message>" line of the open file into two parallel lists
classes = []
text = []
for content in file:
  # Split on the first tab only: the line is split once instead of twice, and a
  # message that itself contains a tab is kept whole.
  label, separator, message = content.partition('\t')
  if not separator:
    # Blank or malformed line without a tab; indexing [1] would raise IndexError.
    continue
  classes.append(label)
  text.append(message)  # the trailing newline is kept, as before
file.close()  # the handle has been fully consumed, so release it



In [None]:
## attach the parsed labels and messages to the DataFrame as columns
for column_name, column_values in (('Class', classes), ('text', text)):
  data[column_name] = column_values

In [None]:
## preview the first five rows of the raw frame
data.head(n=5)

Unnamed: 0,Class,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...\n
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
## encode the labels numerically: ham -> 0, spam -> 1
label_codes = {'ham': 0, 'spam': 1}
new_class = []
for txt in data['Class']:
  # Labels other than 'ham' / 'spam' contribute nothing to the list.
  if txt in label_codes:
    new_class.append(label_codes[txt])

data['Class'] = new_class

In [None]:
## preview the first five rows after label encoding
data.head(n=5)

Unnamed: 0,Class,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...\n
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
## define a text_preprocessing_pipeline
def textprocessing(text):
  """Normalise one raw message string.

  Steps, in order: collapse every run of whitespace into a single space
  (which also trims the ends), lowercase, then delete every character listed
  in ``string.punctuation``. No stemming or stop-word removal is done here.

  Args:
    text: the raw message (str).

  Returns:
    The normalised string. Deleting punctuation happens after whitespace is
    collapsed, so a free-standing symbol can leave two adjacent spaces.
  """
  collapsed = " ".join(text.split())
  punctuation_remover = str.maketrans("", "", string.punctuation)
  return collapsed.lower().translate(punctuation_remover)

  


In [None]:
## TF-IDF helper: converts the text data into TF-IDF vectors
def tfidf(text):
  """Fit a TF-IDF model on ``text`` and return the weights as a dense DataFrame.

  A new ``TfidfVectorizer`` (word analyzer, English stop words) is created and
  fitted on every call, so the vocabulary -- and therefore the columns -- are
  specific to the documents passed in.

  Args:
    text: iterable of document strings.

  Returns:
    pandas.DataFrame with one row per document and one column per vocabulary
    term, holding the TF-IDF weights.
  """
  vectorizer = TfidfVectorizer(analyzer='word', stop_words='english')
  tfidf_wm = vectorizer.fit_transform(text)
  # get_feature_names() was removed in scikit-learn 1.2; use its replacement
  # when available and fall back only on releases that predate it.
  if hasattr(vectorizer, 'get_feature_names_out'):
    tfidf_tokens = vectorizer.get_feature_names_out()
  else:
    tfidf_tokens = vectorizer.get_feature_names()
  # toarray() densifies the sparse matrix so it can be wrapped in a DataFrame.
  df_tfidfvect = pd.DataFrame(data=tfidf_wm.toarray(), columns=tfidf_tokens)

  return df_tfidfvect



In [None]:
## TF-IDF produces a very high-dimensional matrix, so this helper uses PCA to reduce the dimensionality
def pca(datas, dimension, random_state=42):
  """Project ``datas`` onto its first ``dimension`` principal components.

  A new PCA model is fitted on ``datas`` on every call and then used to
  transform that same data.

  Args:
    datas: 2-D feature matrix (array-like or DataFrame).
    dimension: number of principal components to keep.
    random_state: seed forwarded to ``PCA``. With the default solver choice
      scikit-learn may use a randomized SVD on large inputs, so a fixed seed
      keeps the projection reproducible between runs.

  Returns:
    The transformed data as returned by ``PCA.transform``.
  """
  # Named `reducer` so the local variable does not shadow this function's name.
  reducer = PCA(n_components=dimension, random_state=random_state)
  reducer.fit(datas)
  new_data = reducer.transform(datas)
  return new_data

In [None]:
## train test and validation split
# Shuffle once with a fixed seed, then cut at 60% and 80% of the rows.
# Positional slicing replaces np.split, which on a DataFrame goes through the
# deprecated DataFrame.swapaxes; .copy() gives each split its own data so later
# column assignments do not raise SettingWithCopyWarning.
shuffled = data.sample(frac=1, random_state=42)
train_end = int(.6 * len(data))
validate_end = int(.8 * len(data))
train = shuffled.iloc[:train_end].copy()
validate = shuffled.iloc[train_end:validate_end].copy()
test = shuffled.iloc[validate_end:].copy()


In [None]:
## a function that performs all the data preprocessing steps
def cleaning_data(data, n_components=20):
  """Turn a frame with 'text' and 'Class' columns into a model-ready frame.

  Pipeline: normalise each message with ``textprocessing``, vectorise with
  ``tfidf``, reduce with ``pca``, then append the labels as 'target'.

  The input frame is not modified; the original version overwrote
  ``data['text']`` in the caller's frame.

  NOTE(review): ``tfidf`` and ``pca`` each fit a new model per call, so frames
  produced by separate calls (e.g. train vs. test) do not share a feature
  space. Confirm this is intended before training on one and scoring another.

  Args:
    data: DataFrame with a 'text' column of strings and a 'Class' column.
    n_components: number of PCA components to keep (default 20, the value
      previously hard-coded).

  Returns:
    DataFrame with feature columns labelled 1..k (k = number of components
    returned by ``pca``) and a 'target' column holding ``data['Class']``.
  """
  cleaned_text = data['text'].apply(textprocessing)  # work on a new Series
  tfidf_data = tfidf(cleaned_text.values.tolist())  ## converting into tfidf vector
  reduced = pca(tfidf_data, n_components)  ## reducing the dimension
  # Label the columns from the actual width rather than a second hard-coded 20.
  feature_columns = list(range(1, reduced.shape[1] + 1))
  final_data = pd.DataFrame(reduced, columns=feature_columns)
  final_data['target'] = data['Class'].values.tolist()
  return final_data

In [None]:
## run the cleaning pipeline on each split (call order: train, test, validation)
clean_train, clean_test, clean_valid = [
    cleaning_data(split) for split in (train, test, validate)
]



In [None]:
## preview the first five rows of the cleaned test split
clean_test.head(n=5)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,12,13,14,15,16,17,18,19,20,target
0,-0.006106,-0.017323,0.010018,0.000162,-0.010869,-0.014983,-0.017576,-0.023403,-0.0112,-0.003491,...,-0.003887,0.007716,0.004664,-0.045852,-0.027145,-0.016353,0.012189,-0.0111,0.009411,0
1,-0.02559,-0.011748,-0.011015,-0.045809,-0.03508,-0.019468,-0.03152,-0.014525,-0.04843,0.009975,...,0.030729,0.01752,0.006541,-0.035717,0.018211,0.00656,0.09827,0.058421,-0.024556,0
2,-0.013536,-0.030806,0.000213,-0.045282,0.034772,0.252739,-0.044197,0.003539,0.048456,-0.00033,...,0.04921,-0.00612,0.013758,-0.055657,-0.020782,0.056603,-0.010829,-0.039656,-0.041803,0
3,0.233055,0.000143,-0.041087,-0.07743,-0.124727,-0.124795,0.057186,0.131462,0.21041,-0.061417,...,-0.063327,-0.071557,0.113556,-0.060079,-0.009442,0.108288,0.005616,0.012295,0.074677,0
4,-0.020847,-0.017792,-0.027428,0.04861,-0.00302,-0.057793,-0.083301,-0.078412,-0.045958,0.080065,...,0.066751,0.15913,-0.036188,0.239901,-0.101722,-0.089125,-0.063903,-0.000301,0.196552,0


In [None]:
## preview the first five rows of the cleaned training split
clean_train.head(n=5)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,12,13,14,15,16,17,18,19,20,target
0,-0.003173,-0.000882,0.033354,0.030973,-0.010749,0.018649,-0.011156,-0.030592,-0.001529,0.054519,...,0.004711,0.084619,0.045057,-0.008905,0.006092,-0.090259,-0.02217,0.044593,-0.062312,0
1,-0.023781,-0.0156,0.069585,-0.029231,-0.006057,0.052954,-0.012233,0.015796,0.037437,0.035407,...,0.09451,-0.000123,-0.001441,-0.000575,0.090334,-0.037713,-0.018147,-0.030219,0.020128,0
2,-0.011903,-0.016064,-0.01515,-0.006831,-0.013297,0.006234,-0.004035,-0.001978,-0.003765,0.024239,...,0.042963,-0.007169,-0.000556,0.005347,0.025716,-0.023383,-0.016988,0.000309,-0.014644,0
3,0.047622,0.452925,-0.032184,0.005439,0.025612,0.037362,-0.059981,-0.015386,-0.033357,0.022429,...,0.04903,-0.026254,0.01035,-0.027137,0.012367,0.024758,0.01318,-0.050674,-0.071168,0
4,-0.017249,-0.009569,0.013907,0.043203,-0.014938,0.004608,0.00477,0.046237,0.014187,-0.008562,...,0.192907,-0.050289,0.220694,-0.017173,-0.008997,0.025578,0.139728,-0.013193,0.067912,0


In [None]:
## preview the first five rows of the cleaned validation split
clean_valid.head(n=5)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,12,13,14,15,16,17,18,19,20,target
0,-0.018448,-0.038103,0.005937,-0.018635,-0.026736,-0.001622,0.024851,-0.002407,-0.014209,0.013387,...,-0.032802,0.004427,-0.019779,-0.011858,0.002304,-0.028436,0.04745,0.003976,-0.015014,1
1,-0.015554,-0.031923,0.004686,-0.015176,-0.009926,-0.003197,0.021329,-0.009572,-0.006486,0.020609,...,-0.024409,0.011212,-0.01681,-0.005173,-0.003157,-0.015212,0.019044,0.025107,-0.00497,0
2,-0.030514,-0.058019,-0.006211,0.033809,0.08655,-0.028042,-0.091,-0.123521,0.038452,0.030805,...,0.10449,0.011378,-0.023989,-0.053986,0.0155,-0.135022,-0.088638,-0.043007,0.099628,0
3,-0.018416,-0.04844,0.018754,0.056281,-0.050306,0.096529,0.105008,0.049266,-0.047686,-0.068799,...,-0.014183,-0.040762,0.032227,-0.066526,-0.097143,0.051571,-0.045706,-0.01255,0.069361,0
4,-0.017369,-0.032273,0.005637,-0.013686,-0.010661,-0.006092,0.022861,0.005283,-0.016279,0.008456,...,-0.023937,0.00142,-0.014803,-0.013904,0.009008,-0.009298,0.024897,0.015028,-0.008649,0


In [None]:
## export each cleaned split (training, test, validation) to its own CSV file
for frame, csv_path in (
    (clean_train, 'clean_train.csv'),
    (clean_test, 'clean_test.csv'),
    (clean_valid, 'clean_valid.csv'),
):
  frame.to_csv(csv_path)

In [None]:
## the classes are imbalanced, so the minority class is oversampled below
import imblearn
from imblearn.over_sampling import RandomOverSampler

In [None]:
## oversample only the minority class
# The seed is fixed so the randomly duplicated rows are the same on every run.
oversample = RandomOverSampler(sampling_strategy='minority', random_state=42)

In [None]:
# X_train / Y_train were never defined anywhere in the notebook, so this cell
# raised NameError. Build them from the cleaned training split: every column
# except 'target' is a feature. Plain NumPy arrays are used so the resampled
# output supports the positional indexing done in evaluate().
X_train = clean_train.drop(columns=['target']).values
Y_train = clean_train['target'].values
X_over, y_over = oversample.fit_resample(X_train, Y_train)

NameError: ignored

In [None]:
## making a bayes classifier model
from sklearn.naive_bayes import GaussianNB

In [None]:
## Gaussian naive Bayes classifier created with its default settings
gnb = GaussianNB()

In [None]:
## fit the naive Bayes classifier on the oversampled training data
model = gnb.fit(X=X_over, y=y_over)

In [None]:
def evaluate(model, X_test, Y_test):
  """Return the accuracy of ``model`` on ``X_test`` against ``Y_test``.

  Args:
    model: fitted estimator exposing ``predict(X)``.
    X_test: features, passed unchanged to ``model.predict``.
    Y_test: true labels, one per prediction; a list, NumPy array or pandas
      Series (its index is ignored, labels are matched by position).

  Returns:
    float: number of correct predictions divided by the number of predictions.

  Raises:
    ValueError: if the number of labels differs from the number of predictions.
    ZeroDivisionError: if there are no predictions at all.
  """
  pre = np.asarray(model.predict(X_test)).ravel()
  # np.asarray discards any pandas index. Indexing a Series with `Y_test[i]` is
  # a label lookup, which pairs predictions with the wrong rows (or raises
  # KeyError) whenever the index is not 0..n-1, e.g. after shuffling.
  truth = np.asarray(Y_test).ravel()
  if pre.shape != truth.shape:
    raise ValueError(
        "got %d predictions but %d labels" % (pre.size, truth.size))
  correct = int(np.count_nonzero(pre == truth))
  total = len(pre)
  return correct / total
 



In [None]:
## accuracy on the same oversampled data the model was fitted on
evaluate(model=model, X_test=X_over, Y_test=y_over)

In [None]:
## normalise every validation message with the shared text-cleaning helper
validate['text'] = validate['text'].map(textprocessing)

In [None]:
## TF-IDF matrix of the validation messages
# NOTE(review): tfidf() fits a new vectorizer on whatever it receives, so these
# columns come from the validation vocabulary, not the training one -- confirm.
tfidf_val = tfidf(validate['text'].tolist())

In [None]:
## reduce the validation TF-IDF matrix to 20 components
# NOTE(review): pca() fits a new PCA on its input, so this projection is
# independent of the one applied to the training data -- confirm it is intended.
pca_val = pca(datas=tfidf_val, dimension=20)

In [None]:
## validation features, and validation labels as a NumPy array
X_val, Y_val = pca_val, validate['Class'].to_numpy()

In [None]:
## display the naive Bayes predictions for the validation features
model.predict(X=X_val)

In [None]:
## validation accuracy of the naive Bayes model
evaluate(model=model, X_test=X_val, Y_test=Y_val)

In [None]:
## random forest
from sklearn.ensemble import RandomForestClassifier

In [None]:
## random forest with tree depth capped at 20 and a fixed seed
clf = RandomForestClassifier(max_depth=20, random_state=0)

In [None]:
## fit the random forest on the oversampled training data
model2 = clf.fit(X=X_over, y=y_over)

In [None]:
## validation accuracy of the random forest model
evaluate(model=model2, X_test=X_val, Y_test=Y_val)