# NLP Basics: Reading In Text Data

### Read In Semi-structured Text Data

The raw dataset being used in this course can be found at: http://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection

In [29]:
# Read in and view the raw data
import pandas as pd

messages = pd.read_csv('spam.csv', encoding = "ISO-8859-1")
messages.head()
# Downloads/Advance_NLP_Python_ML/data/spam.csv

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [32]:
# messages = messages.drop(labels=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
# messages.head()
messages.columns = ["label", "text"]
messages.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [33]:
messages.shape

(5572, 2)

In [34]:
messages["label"].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [36]:
messages.isnull().sum()

label    0
text     0
dtype: int64

In [38]:
pd.set_option("display.max_colwidth", 100)

In [None]:
# Drop unused columns and label columns that will be used
messages = messages.drop(labels = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis = 1)
messages.columns = ["label", "text"]
messages.head()

In [None]:
# How big is this dataset?

In [None]:
# What portion of our text messages are actually spam?

In [None]:
# Are we missing any data?
print('Number of nulls in label: {}'.format())
print('Number of nulls in text: {}'.format())

In [39]:
messages.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [40]:
messages.sample(5)

Unnamed: 0,label,text
1155,ham,"Sorry man, accidentally left my phone on silent last night and didn't check it til I got up"
1638,ham,Great comedy..cant stop laughing da:)
3446,ham,Sitting ard nothing to do lor. U leh busy w work?
5506,ham,God's love has no limit. God's grace has no measure. God's power has no boundaries. May u have G...
4482,ham,True lov n care wil nevr go unrecognized. though somone often makes mistakes when valuing it. bu...


In [41]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [45]:
def remove_punct(text):
    text = "".join([char for char in text if char not in string.punctuation])
    return text

In [46]:
messages["clean_text"] = messages["text"].apply(lambda x : remove_punct(x))

In [47]:
messages.sample(5)

Unnamed: 0,label,text,clean_text
1278,ham,Can i meet Ì_ at 5.. As 4 where depends on where Ì_ wan 2 in lor..,Can i meet Ì at 5 As 4 where depends on where Ì wan 2 in lor
5099,ham,"Ah, well that confuses things, doesnt it? I thought was friends with now. Maybe i did the wrong ...",Ah well that confuses things doesnt it I thought was friends with now Maybe i did the wrong thin...
3669,ham,Ok thanx... Take care then...,Ok thanx Take care then
3408,ham,Whats that coming over the hill..... Is it a monster! Hope you have a great day. Things r going ...,Whats that coming over the hill Is it a monster Hope you have a great day Things r going fine he...
3643,ham,* Thought I didn't see you.,Thought I didnt see you


In [48]:
import re

def tokenize(text):
    tokens = re.split("\W+", text)
    return tokens

In [49]:
messages["text_tokenized"] = messages["clean_text"].apply(lambda x : tokenize(x.lower()))
messages.sample(5)

Unnamed: 0,label,text,clean_text,text_tokenized
69,ham,I plane to give on this month end.,I plane to give on this month end,"[i, plane, to, give, on, this, month, end]"
4322,ham,Aight well keep me informed,Aight well keep me informed,"[aight, well, keep, me, informed]"
1874,spam,Would you like to see my XXX pics they are so hot they were nearly banned in the uk!,Would you like to see my XXX pics they are so hot they were nearly banned in the uk,"[would, you, like, to, see, my, xxx, pics, they, are, so, hot, they, were, nearly, banned, in, t..."
1399,ham,You have registered Sinco as Payee. Log in at icicibank.com and enter URN &lt;#&gt; to confirm...,You have registered Sinco as Payee Log in at icicibankcom and enter URN ltgt to confirm Beware...,"[you, have, registered, sinco, as, payee, log, in, at, icicibankcom, and, enter, urn, ltgt, to, ..."
695,ham,Good. Good job. I like entrepreneurs,Good Good job I like entrepreneurs,"[good, good, job, i, like, entrepreneurs]"


In [51]:
import nltk

stopwords = nltk.corpus.stopwords.words("english")
stopwords[0:5]

['i', 'me', 'my', 'myself', 'we']

In [52]:
def remove_stopwards(tokenized_text):
    text = [word for word in tokenized_text if word not in stopwords]
    return text

In [53]:
messages["text_nostop"] = messages["text_tokenized"].apply(lambda x : remove_stopwards(x))
messages.sample(5)

Unnamed: 0,label,text,clean_text,text_tokenized,text_nostop
132,ham,First answer my question.,First answer my question,"[first, answer, my, question]","[first, answer, question]"
1656,ham,Yes I posted a couple of pics on fb. There's still snow outside too. I'm just waking up :),Yes I posted a couple of pics on fb Theres still snow outside too Im just waking up,"[yes, i, posted, a, couple, of, pics, on, fb, theres, still, snow, outside, too, im, just, wakin...","[yes, posted, couple, pics, fb, theres, still, snow, outside, im, waking, ]"
801,ham,Appt is at &lt;TIME&gt; am. Not my fault u don't listen. I told u twice,Appt is at ltTIMEgt am Not my fault u dont listen I told u twice,"[appt, is, at, lttimegt, am, not, my, fault, u, dont, listen, i, told, u, twice]","[appt, lttimegt, fault, u, dont, listen, told, u, twice]"
2676,ham,* Am on a train back from northampton so i'm afraid not!,Am on a train back from northampton so im afraid not,"[, am, on, a, train, back, from, northampton, so, im, afraid, not]","[, train, back, northampton, im, afraid]"
2787,ham,"Forgot it takes me 3 years to shower, sorry. Where you at/your phone dead yet?",Forgot it takes me 3 years to shower sorry Where you atyour phone dead yet,"[forgot, it, takes, me, 3, years, to, shower, sorry, where, you, atyour, phone, dead, yet]","[forgot, takes, 3, years, shower, sorry, atyour, phone, dead, yet]"


In [54]:
!pip install scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-1.1.1-cp39-cp39-win_amd64.whl (7.4 MB)
Collecting scipy>=1.3.2
  Downloading scipy-1.9.0-cp39-cp39-win_amd64.whl (38.6 MB)
     ---------------------------------------- 38.6/38.6 MB 8.4 MB/s eta 0:00:00
Collecting threadpoolctl>=2.0.0
  Using cached threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Installing collected packages: threadpoolctl, scipy, scikit-learn
Successfully installed scikit-learn-1.1.1 scipy-1.9.0 threadpoolctl-3.1.0


In [56]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(analyzer=remove_stopwards)

In [59]:
X_tfidf = tfidf_vect.fit_transform(messages["text_tokenized"])

In [60]:
X_tfidf.shape

(5572, 9395)

In [65]:
tfidf_vect.get_feature_names_out()

array(['', '0', '008704050406', ..., 'ûïharry', 'ûò', 'ûówell'],
      dtype=object)

In [67]:
X_tfidf

<5572x9395 sparse matrix of type '<class 'numpy.float64'>'
	with 50453 stored elements in Compressed Sparse Row format>

In [68]:
X_features = pd.DataFrame(X_tfidf.toarray())
X_features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9385,9386,9387,9388,9389,9390,9391,9392,9393,9394
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5568,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5569,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5570,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [69]:
from sklearn.ensemble import RandomForestClassifier

In [73]:
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import train_test_split

In [74]:
X_train, X_test, y_train, y_test = train_test_split(X_features, messages["label"], test_size=0.2)

In [75]:
rf = RandomForestClassifier()

In [76]:
rf_model = rf.fit(X_train, y_train)

In [77]:
y_pred = rf_model.predict(X_test)

In [80]:
precision = precision_score(y_test, y_pred, pos_label="spam")
recall = recall_score(y_test, y_pred, pos_label="spam")

In [88]:
print(f"Precision {round(precision, 3)} \nRecall {round(recall, 3)}")                                          

Precision 1.0 
Recall 0.784


In [None]:
 print(f"Hello, My name is {name} and I'm {age} years old.")

In [72]:
print(RandomForestClassifier())

RandomForestClassifier()


In [66]:
tfidf_vect.get_feature_names()

['',
 '0',
 '008704050406',
 '0089my',
 '0121',
 '01223585236',
 '01223585334',
 '0125698789',
 '02',
 '020603',
 '0207',
 '02070836089',
 '02072069400',
 '02073162414',
 '02085076972',
 '020903',
 '021',
 '050703',
 '0578',
 '06',
 '060505',
 '061104',
 '07008009200',
 '07046744435',
 '07090201529',
 '07090298926',
 '07099833605',
 '071104',
 '07123456789',
 '0721072',
 '07732584351',
 '07734396839',
 '07742676969',
 '07753741225',
 '0776xxxxxxx',
 '07786200117',
 '077xxx',
 '078',
 '07801543489',
 '07808',
 '07808247860',
 '07808726822',
 '07815296484',
 '07821230901',
 '0784987',
 '0789xxxxxxx',
 '0794674629107880867867',
 '0796xxxxxx',
 '07973788240',
 '07xxxxxxxxx',
 '0800',
 '08000407165',
 '08000776320',
 '08000839402',
 '08000930705',
 '08000938767',
 '08001950382',
 '08002888812',
 '08002986030',
 '08002986906',
 '08002988890',
 '08006344447',
 '0808',
 '08081263000',
 '08081560665',
 '0825',
 '0844',
 '08448350055',
 '08448714184',
 '0845',
 '08450542832',
 '08452810071',
 '0