In [1]:
## loading all the libraries
import os
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import nltk
import string
from nltk.corpus import stopwords
nltk.download("stopwords")
from nltk.stem.porter import PorterStemmer

stop_word=set(stopwords.words('english'))
import numpy as np
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
## open the raw dataset for reading; the handle is iterated line by line by the parsing loop
file = open('SMSSpamCollection', mode='r')

In [3]:
## start from an empty frame; the parsed columns are attached to it afterwards
data = pd.DataFrame(data=None)

In [4]:
## parse every "<label>\t<message>" line of the file into two parallel lists
classes = []
text = []
file.seek(0)  ## rewind so re-running this cell re-reads the file instead of producing empty lists
for content in file:
  if not content.strip():
    continue  ## ignore blank lines (e.g. an empty line at the end of the file)
  ## split once, and only on the first tab, so a tab inside the message is not cut off
  label, message = content.split('\t', 1)
  classes.append(label)
  ## the message keeps its line terminator; textprocessing() collapses whitespace later
  text.append(message)



In [5]:
## attach the parsed lists to the frame: labels first, then the message text
data['Class'], data['text'] = classes, text

In [6]:
data.head(n=5)  ## preview the first rows of the freshly built frame

Unnamed: 0,Class,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...\n
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
## encode the labels numerically: ham -> 0, spam -> 1
## 0 and 1 map to themselves so re-running this cell on already-encoded data is harmless
label_codes = {'ham': 0, 'spam': 1, 0: 0, 1: 1}
encoded = data['Class'].map(label_codes)
if encoded.isna().any():
  ## fail loudly: silently skipping an unknown label would leave fewer labels than rows
  unknown = sorted(set(map(str, data['Class'][encoded.isna()])))
  raise ValueError("unexpected class labels: {}".format(unknown))
new_class = encoded.astype('int64').tolist()
data['Class'] = new_class

In [8]:
data.head(n=5)  ## preview the frame after the label encoding

Unnamed: 0,Class,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...\n
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
## define a text_preprocessing_pipeline
def textprocessing(text):
  """Normalise one raw message before vectorisation.

  Lower-cases the text, deletes every character listed in
  ``string.punctuation`` and collapses all runs of whitespace into single
  spaces (leading and trailing whitespace is dropped).

  Args:
    text: the message as a ``str``.

  Returns:
    The cleaned ``str``.
  """
  text = text.lower()
  ## delete punctuation in one pass instead of a per-character join
  text = text.translate(str.maketrans('', '', string.punctuation))
  ## collapse whitespace last: removing a standalone punctuation mark
  ## ("a - b") would otherwise leave a double space behind
  return " ".join(text.split())

  


In [10]:
## TF-IDF step: convert the text data into TF-IDF vectors
def tfidf(text):
  """Fit a TF-IDF model on ``text`` and return the dense document-term matrix.

  A new ``TfidfVectorizer`` (word analyzer, English stop words) is fitted on
  every call, so the vocabulary depends only on the documents passed in.

  Args:
    text: iterable of document strings.

  Returns:
    ``pandas.DataFrame`` with one row per document and one column per
    vocabulary term, holding the TF-IDF weights.
  """
  vectorizer = TfidfVectorizer(analyzer='word', stop_words='english')
  tfidf_wm = vectorizer.fit_transform(text)
  ## get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
  ## and fall back to the old name only on releases that predate it
  if hasattr(vectorizer, 'get_feature_names_out'):
    tfidf_tokens = vectorizer.get_feature_names_out()
  else:
    tfidf_tokens = vectorizer.get_feature_names()
  ## toarray() densifies the sparse matrix (documents x vocabulary terms)
  return pd.DataFrame(data=tfidf_wm.toarray(), columns=tfidf_tokens)



In [11]:
## after TF-IDF the feature dimension is huge, so this function uses PCA to reduce it
def pca(datas, dimension, random_state=42):
  """Fit a PCA on ``datas`` and return ``datas`` projected onto it.

  Args:
    datas: feature matrix accepted by ``PCA.fit`` (rows are samples).
    dimension: forwarded to ``PCA(n_components=...)``.
    random_state: seed forwarded to ``PCA``. scikit-learn may select a
      randomized SVD solver for large inputs; seeding it keeps the projection
      reproducible between runs. Pass ``None`` for unseeded behaviour.

  Returns:
    The output of ``PCA.transform(datas)``.
  """
  ## named `reducer` so the local no longer shadows this function's own name
  reducer = PCA(n_components=dimension, random_state=random_state)
  reducer.fit(datas)
  return reducer.transform(datas)

In [12]:
## train test and validation split
## shuffle once with a fixed seed, then cut 60% / 20% / 20% by position
shuffled = data.sample(frac=1, random_state=42)
train_end = int(.6 * len(shuffled))
valid_end = int(.8 * len(shuffled))
## plain iloc slices replace np.split, which on a DataFrame goes through the
## deprecated DataFrame.swapaxes; .copy() makes each split an independent frame
train = shuffled.iloc[:train_end].copy()
validate = shuffled.iloc[train_end:valid_end].copy()
test = shuffled.iloc[valid_end:].copy()
assert len(train) + len(validate) + len(test) == len(data)


In [13]:
## a function to perform all the data preprocessing steps
def cleaning_data(data, n_components=20):
  """Run the full preprocessing pipeline on one split.

  Steps: clean the ``text`` column with ``textprocessing``, vectorise it with
  ``tfidf``, reduce it with ``pca`` and attach the labels.

  NOTE(review): ``tfidf`` and ``pca`` are fitted from scratch on every call,
  so the component columns produced by separate calls (e.g. train vs. test)
  do not describe the same feature space.

  Args:
    data: DataFrame with a ``text`` column of strings and a ``Class`` column
      of labels. It is not modified.
    n_components: number of PCA components to keep (default 20).

  Returns:
    DataFrame whose columns ``1..k`` hold the PCA components and whose
    ``target`` column holds the values of ``data['Class']``.
  """
  ## apply() returns a new Series, so the caller's frame keeps its raw text
  cleaned_text = data['text'].apply(textprocessing)
  tfidf_data = tfidf(cleaned_text.values.tolist())  ## converting into tfidf vector
  reduced = np.asarray(pca(tfidf_data, n_components))  ## reducing the dimension
  ## label the columns from the actual output width instead of a hard-coded 20
  component_labels = list(range(1, reduced.shape[1] + 1))
  final_data = pd.DataFrame(reduced, columns=component_labels)
  final_data['target'] = data['Class'].values.tolist()
  return final_data

In [14]:
## run the preprocessing pipeline on each split (order: train, test, validate)
## NOTE(review): every call fits its own TF-IDF vocabulary and PCA, so the 20
## component columns of one split are not the same features as another split's
clean_train = cleaning_data(data=train)
clean_test = cleaning_data(data=test)
clean_valid = cleaning_data(data=validate)



In [15]:
clean_test.head(n=5)  ## preview the processed test split

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,12,13,14,15,16,17,18,19,20,target
0,-0.006105,-0.017325,0.010101,0.000587,-0.011001,-0.015396,-0.016621,-0.023635,-0.011098,-0.001913,...,-0.002821,0.009033,0.004695,-0.052012,-0.018591,-0.00124,0.020452,-0.005065,-0.012176,0
1,-0.02559,-0.011748,-0.011149,-0.045213,-0.03514,-0.01899,-0.029956,-0.014789,-0.047194,0.004417,...,0.041884,0.010925,-0.001916,-0.020867,0.017855,-0.042483,0.088117,0.088321,0.030322,0
2,-0.013538,-0.030806,0.00028,-0.046705,0.035044,0.254204,-0.04203,0.006701,0.043682,0.002605,...,0.052317,-0.012119,0.011504,-0.056361,-0.028783,-0.059149,-0.053153,-0.021422,0.021236,0
3,0.233066,0.000126,-0.040726,-0.075089,-0.125572,-0.129061,0.058019,0.130669,0.208672,-0.049396,...,-0.075923,-0.074466,0.104386,-0.017134,-0.013579,-0.136973,-0.021289,0.000482,-0.094998,0
4,-0.020845,-0.017795,-0.027509,0.047868,-0.002569,-0.057829,-0.086679,-0.078255,-0.040286,0.064965,...,0.09115,0.152329,-0.018432,0.232292,-0.085449,0.168864,-0.000449,-0.078527,0.009495,0


In [16]:
clean_train.head(n=5)  ## preview the processed training split

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,12,13,14,15,16,17,18,19,20,target
0,-0.003166,-0.000891,0.033063,0.032015,-0.011327,0.019789,-0.013574,-0.025091,0.001547,-0.053006,...,0.010189,0.080389,0.031988,-0.014973,0.00088,-0.082865,0.019051,-0.007707,-0.10182,0
1,-0.023786,-0.015607,0.069512,-0.030258,-0.005888,0.052176,-0.01537,0.012536,0.042024,-0.035834,...,0.085832,-0.027958,-0.006498,-0.029879,0.096344,0.015114,-0.03272,-0.028458,0.070633,0
2,-0.011908,-0.016071,-0.015163,-0.006879,-0.013066,0.00576,-0.005055,-0.002019,-0.004951,-0.023047,...,0.039146,-0.0117,-0.007505,-0.012449,0.034737,-0.013072,-0.01522,-0.030595,-0.020409,0
3,0.04762,0.452905,-0.031816,0.004548,0.02702,0.031953,-0.057714,-0.01805,-0.030397,-0.01702,...,0.02877,-0.016708,0.001508,-0.00478,-0.01271,-0.009503,-0.009527,-0.088585,0.006594,0
4,-0.017245,-0.009554,0.013812,0.04269,-0.014061,0.005701,0.006683,0.045074,0.013716,0.006301,...,0.207436,-0.054324,0.214601,-0.007222,-0.062318,0.07903,0.11729,0.010126,0.051181,0


In [17]:
clean_valid.head(n=5)  ## preview the processed validation split

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,12,13,14,15,16,17,18,19,20,target
0,-0.01845,-0.038118,0.005952,-0.019095,-0.026645,-0.001607,0.024348,-0.00347,-0.014079,0.015703,...,-0.036739,0.00502,-0.022452,0.004097,0.012492,0.027982,0.036194,-0.032344,-0.005755,1
1,-0.01556,-0.031909,0.004665,-0.015499,-0.010085,-0.003545,0.021916,-0.009188,-0.006875,0.023064,...,-0.025056,0.010294,-0.013687,0.006981,0.009316,0.007151,0.003568,-0.034666,-0.004112,0
2,-0.0305,-0.058061,-0.006372,0.032959,0.089534,-0.024079,-0.094906,-0.12638,0.047836,0.023912,...,0.100665,0.007336,-0.039511,-0.009753,0.026906,0.162498,-0.025313,0.100398,-0.011136,0
3,-0.018431,-0.048342,0.018915,0.056404,-0.052312,0.093512,0.108928,0.053027,-0.048366,-0.072426,...,-0.016548,-0.037903,0.031289,-0.04925,0.091251,-0.050996,-0.040244,0.081844,0.032811,0
4,-0.017367,-0.032291,0.005608,-0.013552,-0.010476,-0.006353,0.022377,0.003278,-0.017134,0.010249,...,-0.024179,0.000308,-0.012399,0.01234,0.009884,0.009616,0.015559,-0.028419,0.000108,0


In [18]:
## export each cleaned split to its own CSV file, in the order train, test, validation
for csv_name, frame in (
  ('clean_train.csv', clean_train),  ## cleaned training data
  ('clean_test.csv', clean_test),    ## cleaned test data
  ('clean_valid.csv', clean_valid),  ## cleaned validation data
):
  frame.to_csv(csv_name)