In [1]:
## loading all the library
import os
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import nltk
import string
from nltk.corpus import stopwords
nltk.download("stopwords")
from nltk.stem.porter import PorterStemmer

stop_word=set(stopwords.words('english'))
import numpy as np
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
file = open ('SMSSpamCollection', 'r') ## reading the dataset

In [3]:
data= pd.DataFrame()

In [4]:
classes=[]
text =[]
for content in file:
  classes.append((content.split('\t'))[0])
  text.append((content.split('\t'))[1])



In [5]:
data['Class']= classes ## making the dataset in pandas dataframe form
data['text']= text

In [6]:
data.head() 

Unnamed: 0,Class,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...\n
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
new_class=[]   ## changing the value of ham as 0 and spam as 1
for txt in data['Class']:
  if txt == 'ham':
    new_class.append(0)
  elif txt == 'spam':
    new_class.append(1)
  
data['Class']= new_class

In [8]:
data.head()

Unnamed: 0,Class,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...\n
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
## define a text_preprocessing_pipeline
def textprocessing(text):
  text=" ".join(text.split())  ## removing the unnecessary whitespace
  text = text.lower()  ## making lowercase
  # porter = PorterStemmer()
  text = "".join([char for char in text if char not in string.punctuation]) ## removing punctuation
  # text = "".join([char for char in text if char not in stop_word])
  # text = "".join([porter.stem(word) for word in text])

  return text

  


In [10]:
## making tfidf pipeline to covert the sentences to tfidf vector
def tfidf(text):
  vectorizer = TfidfVectorizer(analyzer='word',stop_words= 'english')
  tfidf_wm = vectorizer.fit_transform(text)
  tfidf_tokens = vectorizer.get_feature_names()
  df_tfidfvect = pd.DataFrame(data = tfidf_wm.toarray(),columns = tfidf_tokens)

  return df_tfidfvect



In [11]:
## after performing tfidf the dimension will be huge so function to use pca to reduce dimension
def pca(datas, dimension):
  pca = PCA(n_components=dimension)
  pca.fit(datas)
  new_data= pca.transform(datas)
  return new_data

In [12]:
## train test and validation split
train, validate, test = np.split(data.sample(frac=1, random_state=42), 
                       [int(.6*len(data)), int(.8*len(data))])


In [13]:
train.head() ## training data

Unnamed: 0,Class,text
3690,0,You still coming tonight?\n
3527,0,"""HEY BABE! FAR 2 SPUN-OUT 2 SPK AT DA MO... DE..."
724,0,Ya even those cookies have jelly on them\n
3370,0,Sorry i've not gone to that place. I.ll do so ...
468,0,When are you going to ride your bike?\n


In [14]:
validate.head() ## validation data

Unnamed: 0,Class,text
593,1,PRIVATE! Your 2003 Account Statement for 07753...
2725,0,Nope... C ü then...\n
41,0,"Did I forget to tell you ? I want you , I need..."
560,0,Aiyo... U always c our ex one... I dunno abt m...
5213,0,It is only yesterday true true.\n


In [15]:
test.head() ## test data

Unnamed: 0,Class,text
2106,0,I fetch yun or u fetch?\n
2679,0,Was playng 9 doors game and gt racing on phone...
2761,0,I dont thnk its a wrong calling between us\n
1387,0,All e best 4 ur exam later.\n
5235,0,Hey what how about your project. Started aha d...


In [16]:
## a function to perform all the data perprocessing step
def cleaning_data(data):
  data['text']= data['text'].apply(textprocessing)
  tfidf_data= tfidf(data['text'].values.tolist()) ## converting into tfidf vector
  tfidf_data= pca(tfidf_data,20) ## reducing the dimension with 20
  final_data = pd.DataFrame(tfidf_data, columns=list(range(1,21)))
  final_data['target']= data['Class'].values.tolist()
  return final_data

In [18]:
clean_train= cleaning_data(train)   ## final train data that we can use for training
clean_test = cleaning_data(test)    ## final test data that we can use for test
clean_valid= cleaning_data(validate) ## final validation data that we can use for validation



In [19]:
clean_test.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,12,13,14,15,16,17,18,19,20,target
0,-0.006105,-0.017323,0.010057,0.000274,-0.011144,-0.015193,-0.01635,-0.023676,-0.011499,-0.002646,...,-0.003914,0.010218,0.004458,0.053836,-0.001461,-0.006587,0.019337,-0.010404,-0.00041,0
1,-0.025594,-0.011758,-0.010969,-0.045783,-0.035524,-0.018129,-0.028588,-0.014294,-0.046671,0.001756,...,0.029974,0.01155,0.009914,0.016509,0.051373,-0.019118,0.101869,0.00426,0.033462,0
2,-0.013536,-0.030807,0.000317,-0.046056,0.035215,0.253355,-0.041604,0.002639,0.046256,-0.001079,...,0.048404,-0.005162,0.011269,0.045997,0.001198,-0.051737,-0.034553,-0.009904,-0.068255,0
3,0.233057,0.000168,-0.040955,-0.07672,-0.125092,-0.129146,0.056794,0.130694,0.207863,-0.062623,...,-0.058999,-0.068917,0.090431,0.045908,-0.006182,-0.149398,0.004727,-0.005485,0.020868,0
4,-0.02085,-0.01779,-0.027569,0.048827,-0.002189,-0.055186,-0.083184,-0.08383,-0.043786,0.064314,...,0.051773,0.171184,-0.008625,-0.165848,-0.195566,0.186226,-0.060218,0.064003,-0.022088,0


In [20]:
clean_train.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,12,13,14,15,16,17,18,19,20,target
0,-0.003172,-0.00088,0.033335,0.032479,-0.01084,0.018611,-0.012913,-0.0242,-0.000612,0.055941,...,0.009071,0.088389,0.034805,0.00779,-0.002388,0.066713,-0.031047,0.036699,-0.01538,0
1,-0.023782,-0.015588,0.069604,-0.028787,-0.004994,0.051897,-0.014085,0.017917,0.034323,0.03596,...,0.087974,-0.003221,-0.012929,0.050153,-0.04771,0.031589,-0.038082,-0.012691,0.002334,0
2,-0.011903,-0.016073,-0.015121,-0.006605,-0.013093,0.006361,-0.004433,-0.001506,-0.002556,0.024595,...,0.035523,-0.009052,-0.006713,0.019628,-0.020886,0.031094,-0.02682,-0.031437,-0.006116,0
3,0.047613,0.452908,-0.031915,0.003876,0.026786,0.033919,-0.059991,-0.017726,-0.031078,0.010558,...,0.030045,-0.028899,-0.006513,0.01742,0.018755,-0.052405,-0.011092,-0.09943,0.019863,0
4,-0.017246,-0.009538,0.013872,0.043196,-0.013942,0.004972,0.005626,0.04608,0.013401,-0.002426,...,0.203497,-0.036976,0.224047,0.001602,0.05936,-0.007898,0.12715,-0.007503,0.072395,0


In [21]:
clean_valid.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,12,13,14,15,16,17,18,19,20,target
0,-0.018448,-0.038185,0.0059,-0.019056,-0.02629,-0.000453,0.024932,-0.002321,-0.013862,0.012586,...,-0.028078,0.000986,-0.018513,0.008109,8.2e-05,0.023494,-0.046896,0.019305,-0.014557,1
1,-0.01556,-0.032046,0.004747,-0.015753,-0.010305,-0.002999,0.022536,-0.009068,-0.007074,0.016984,...,-0.019921,0.007712,-0.017686,0.006426,0.003084,-0.002418,-0.017448,0.041463,-0.001476,0
2,-0.030502,-0.058096,-0.006266,0.032978,0.090128,-0.023984,-0.098561,-0.122764,0.03829,0.041932,...,0.127322,0.010444,-0.040583,-0.018932,0.036763,0.153662,0.062423,-0.095878,0.051922,0
3,-0.01842,-0.048291,0.018776,0.056877,-0.05237,0.092424,0.108414,0.049076,-0.045116,-0.072901,...,-0.016051,-0.035613,0.036438,0.001579,0.115005,-0.03655,0.038497,-0.074285,0.014828,0
4,-0.017369,-0.032334,0.005615,-0.013934,-0.010269,-0.006095,0.022781,0.004579,-0.016256,0.008357,...,-0.021327,-0.000729,-0.012153,0.013691,-0.000406,0.004229,-0.024851,0.021776,-0.009855,0


In [22]:
clean_train.to_csv('clean_train.csv') ## exporting all clean training data
clean_test.to_csv('clean_test.csv')## exporting all clean training data
clean_valid.to_csv('clean_valid.csv') ## exporting all clean vaildation data