# NLP Basics: Reading in text data & why do we need to clean the text?


### Read in semi-structured text data


In [1]:
# Read the whole file into one string. The context manager closes the file
# handle as soon as the read finishes instead of leaving it open in the kernel.
with open("SMSSpamCollection.tsv") as rawFile:
    rawData = rawFile.read()
# Show the first 500 characters of the raw text
rawData[0:500]


"ham\tI've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.\nspam\tFree entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's\nham\tNah I don't think he goes to usf, he lives around here though\nham\tEven my brother is not like to speak with me. They treat me like aid"

### Parse the data: replace each '\t' with '\n', then split on '\n'

In [5]:
# Turn every tab into a newline so labels and messages share one delimiter,
# then cut the text at each newline into a flat list of items.
newlineDelimited = rawData.replace('\t', '\n')
parsedData = newlineDelimited.split('\n')
parsedData[:5]


['ham',
 "I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.",
 'spam',
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
 'ham']

### Create list of labels (rows 1,3,...) and texts (rows 2,4,...) 

In [9]:
# Even positions (0, 2, 4, ...) of the parsed list hold the labels
labelList = parsedData[::2]
# Odd positions (1, 3, 5, ...) of the parsed list hold the message texts
textList = parsedData[1::2]
# Preview the first five entries of each list
print(labelList[:5])
print(textList[:5])


['ham', 'spam', 'ham', 'ham', 'ham']
["I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.", "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's", "Nah I don't think he goes to usf, he lives around here though", 'Even my brother is not like to speak with me. They treat me like aids patent.', 'I HAVE A DATE ON SUNDAY WITH WILL!!']


## Create pandas dataframe 

In [10]:
import pandas as pd

# Building a DataFrame from a dict of lists requires every list to have the
# same length. This attempt is expected to fail when labelList and textList
# differ in length, so the error is caught and shown instead of left uncaught:
# an uncaught exception would stop "Restart Kernel -> Run All" at this cell.
try:
    fullCorpus = pd.DataFrame( {
        'label' : labelList,
        'body_list' : textList
    })
except ValueError as err:
    # Raised because the arrays do not all have the same length
    print(f"ValueError: {err}")
else:
    print(fullCorpus.head())



ValueError: arrays must all be same length

### Check the lengths of the lists

In [13]:
# Report the size of the label list, then of the text list
for column in (labelList, textList):
    print(len(column))

# Inspect the tail of the label list (last five entries)
print(labelList[-5:])

# Build the frame from all labels except the final one, so that both
# columns end up with the same number of rows
labelsTrimmed = labelList[:-1]
fullCorpus = pd.DataFrame(dict(label=labelsTrimmed, body_list=textList))
fullCorpus.head()


5571
5570
['ham', 'ham', 'ham', 'ham', '']


Unnamed: 0,label,body_list
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


## Create pandas dataframe directly from the tab-separated (TSV) file

In [14]:
# Load the tab-separated file straight into a DataFrame. The file has no
# header row, so pandas numbers the columns 0 and 1.
# NOTE(review): default quote handling is in effect; if messages contain
# unbalanced double quotes, rows may be merged - confirm the row count.
dataset = pd.read_csv(
    "SMSSpamCollection.tsv",
    delimiter='\t',  # `delimiter` is an alias of `sep`
    header=None,
)
dataset.head()

Unnamed: 0,0,1
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!
