# NLP Project -> TF-IDF,RandomForest

# Spam or Not Spam

#### Read CSV,Clean Data,Prepare Data and convert it into vector(TF-IDF), Use RandomeForest to predict

## Fetch Data

In [1]:
import nltk
import pandas as pd

pd.set_option('display.max_colwidth',100)
messages = pd.read_csv('spam.csv',encoding="latin-1")
messages.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [2]:
# drop unused columns
# columns_to_drop = ["Unnamed: 2 ", "Unnamed: 3","Unnamed: 4","Unnamed: 5","Unnamed: 6","Unnamed: 7","Unnamed: 8","Unnamed: 9","Unnamed: 10","Unnamed: 11","Unnamed: 12","Unnamed: 13","Unnamed: 14"]
# messages = messages.drop(columns=columns_to_drop)
messages.columns=['label','text']
messages['label'].value_counts()
print('Number of null in label: {}'.format(messages['label'].isnull().sum()))
print('Number of null in text: {}'.format(messages['text'].isnull().sum()))
messages.head()

Number of null in label: 0
Number of null in text: 0


Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [3]:
messages['label'].value_counts()

label
ham     4825
spam     747
Name: count, dtype: int64

## Clean Text

In [4]:
import string
import re
stopwords = nltk.corpus.stopwords.words('english')

def clean_text(text):
    text = "".join([char for char in text if char not in string.punctuation])
    tokens = re.split('\W+',text)
    text = [word for word in tokens if word not in stopwords]
    return text

## TF-IDF Vectorizer

In [5]:
#apply TF-IDF Vectorizer

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(messages['text'])
print(X_tfidf.shape)
print(tfidf_vect.get_feature_names_out())

(5572, 11524)
['' '0' '008704050406' ... 'é' 'ü' 'üll']


In [6]:
X_features = pd.DataFrame(X_tfidf.toarray())
X_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11514,11515,11516,11517,11518,11519,11520,11521,11522,11523
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## RandomForest For Classification

In [7]:
#random forest for classification
from sklearn.ensemble import RandomForestClassifier

In [8]:
from sklearn.metrics import precision_score,recall_score
from sklearn.model_selection import train_test_split

In [9]:
X_train,X_test,Y_train,Y_test = train_test_split(X_features,messages['label'],test_size=0.2)

print(X_train.shape,Y_train.shape)

(4457, 11524) (4457,)


In [10]:
rf = RandomForestClassifier()
rf_model = rf.fit(X_train,Y_train)

In [11]:
Y_pred = rf_model.predict(X_test)

In [13]:
precision = precision_score(Y_test,Y_pred,pos_label='spam')
recall = recall_score(Y_test,Y_pred,pos_label='spam')
print("precision : {} / Recall: {}".format(round(precision, 3),round(recall, 3)))

precision : 1.0 / Recall: 0.791


text = ["Free joins the workshop"]
text_tfidf = tfidf_vect.transform(text) #Use transform instead of fit_transform
X_features = pd.DataFrame(text_tfidf.toarray())
X_features.head()

In [27]:
text = ["Free joins the workshop"]
text_tfidf = tfidf_vect.transform(text) #Use transform instead of fit_transform
X_features = pd.DataFrame(text_tfidf.toarray())
X_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11514,11515,11516,11517,11518,11519,11520,11521,11522,11523
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
y_pred = rf_model.predict(text_tfidf)

In [29]:
y_pred

array(['ham'], dtype=object)