# Exploring Text Data

In [3]:
import os
# Load every recipe file into memory, keyed by its file name without extension,
# then build one big string containing the whole corpus.
data_folder = os.path.join('..','data','recipes')

# os.listdir() returns entries in arbitrary order; sorting keeps the document
# order (and therefore the concatenated corpus) reproducible across runs.
# Entries that are not regular files (e.g. a sub-directory) are skipped so that
# open() below cannot fail on them.
all_recipe_files = [
    path
    for path in (os.path.join(data_folder, fname)
                 for fname in sorted(os.listdir(data_folder)))
    if os.path.isfile(path)
]

documents = {}
for recipe_name in all_recipe_files:
    bname = os.path.basename(recipe_name)
    recipe_number = os.path.splitext(bname)[0]
    # Explicit encoding: the default used by open() depends on the platform
    # locale, while the tokenization step of this notebook treats the text as UTF-8.
    with open(recipe_name, 'r', encoding='utf-8') as f:
        documents[recipe_number] = f.read()

# Documents are separated by a single space in the all-in-one corpus.
corpus_all_in_one = ' '.join(documents.values())

print("Number of docs: {}".format(len(documents)))
print("Corpus size (char): {}".format(len(corpus_all_in_one)))

Number of docs: 220
Corpus size (char): 161170


# Tokenization

In [4]:
from nltk.tokenize import word_tokenize

# Tokenize the whole corpus in one pass. If tokenization raises
# UnicodeDecodeError, retry on the UTF-8-decoded corpus.
try:
    all_tokens = list(word_tokenize(corpus_all_in_one))
except UnicodeDecodeError:
    decoded_corpus = corpus_all_in_one.decode('utf-8')
    all_tokens = list(word_tokenize(decoded_corpus))

token_total = len(all_tokens)
print("Total number of tokens: {}".format(token_total))

Total number of tokens: 33719


# Counting Words

In [5]:
from collections import Counter

# Raw term frequency: how many times each token occurs in the whole corpus
# (punctuation and stop words are still included at this stage).
total_term_frequency = Counter()
total_term_frequency.update(all_tokens)

# Show the 20 most frequent tokens as "token<TAB>count".
for entry in total_term_frequency.most_common(20):
    print("{}\t{}".format(*entry))

the	1933
,	1726
.	1568
and	1435
a	1076
of	988
in	811
with	726
it	537
to	452
or	389
is	337
(	295
)	295
be	266
them	248
butter	231
on	220
water	205
little	198


In [6]:
# Document frequency: the number of recipes in which each token appears
# at least once.
document_frequency = Counter()
for content in documents.values():
    # A set collapses repeated tokens, so each document adds at most 1 per token.
    document_frequency.update(set(word_tokenize(content)))

# Show the 20 tokens found in the most documents as "token<TAB>count".
for entry in document_frequency.most_common(20):
    print("{}\t{}".format(*entry))

and	220
.	220
,	219
)	218
(	218
the	217
in	215
of	210
a	210
with	203
it	167
to	165
or	165
is	145
salt	142
butter	137
on	136
be	133
put	126
water	125


# Stop words

In [7]:
from nltk.corpus import stopwords
import string

# Inspect the building blocks of the stop list: the NLTK English stop words,
# how many there are, and the ASCII punctuation characters.
english_stopwords = stopwords.words('english')
print(english_stopwords)
print(len(english_stopwords))
print(string.punctuation)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [9]:
# Stop list = NLTK English stop words plus every ASCII punctuation character.
stop_list = stopwords.words('english') + list(string.punctuation)

# Testing membership against a list scans it once per token; a set gives
# constant-time lookups with identical results. stop_list itself stays a list.
stop_set = set(stop_list)

# NOTE: matching is case-sensitive and no lower-casing is applied here, so
# capitalised tokens such as "The" or "When" are kept.
tokens_no_stop = [token for token in all_tokens if token not in stop_set]

total_term_frequency_no_stop = Counter(tokens_no_stop)

# Show the 20 most frequent remaining tokens as "token<TAB>count".
for word, freq in total_term_frequency_no_stop.most_common(20):
    print("{}\t{}".format(word,freq))

butter	231
water	205
little	198
put	197
one	186
salt	185
fire	169
half	169
two	157
When	132
sauce	128
pepper	128
add	125
cut	125
flour	116
piece	116
The	111
sugar	100
saucepan	100
oil	99


In [11]:
# Look up the corpus-wide count of a single term
# (a Counter returns 0 for terms it has never seen).
olives_count = total_term_frequency_no_stop['olives']
print(olives_count)

3


# Text Normalisation

Replacing tokens with a canonical form, so we can group together different spellings/variations of the same word.

Stemming is the process of reducing a word to its base/root form, called the stem.

In [12]:
from nltk.stem import PorterStemmer

# Normalise the tokens: lower-case everything, drop stop words and
# punctuation, then reduce each remaining token to its Porter stem.
stemmer = PorterStemmer()
all_tokens_lower = [t.lower() for t in all_tokens]

# Testing membership against a list scans it once per token; a set gives
# constant-time lookups with identical results. stop_list itself is untouched.
stop_set = set(stop_list)

# Stop words are filtered on the lower-cased token, before stemming.
tokens_normalized = [stemmer.stem(t) for t in all_tokens_lower if t not in stop_set]

total_term_frequency_normalised = Counter(tokens_normalized)

# Show the 20 most frequent stems as "stem<TAB>count".
for word, freq in total_term_frequency_normalised.most_common(20):
    print("{}\t{}".format(word,freq))

put	286
butter	245
salt	215
piec	211
one	210
water	209
cook	208
littl	198
cut	175
half	170
brown	169
fire	169
egg	163
two	162
add	160
boil	154
sauc	152
pepper	130
serv	128
remov	127


# N-grams
Stop-word removal will affect n-grams,
e.g., phrases like "a pinch of salt" become "pinch salt" after stop-word removal.

In [15]:
from nltk import ngrams
# Bigram counts over the lower-cased token stream
# (stop words and punctuation are still present).
phrases = Counter()
phrases.update(ngrams(all_tokens_lower, 2))
for entry in phrases.most_common(20):
    print("{}\t{}".format(*entry))

('in', 'the')	175
('in', 'a')	172
('of', 'the')	153
('with', 'a')	142
('.', 'when')	131
('the', 'fire')	129
('on', 'the')	128
(',', 'and')	117
('with', 'the')	117
('salt', 'and')	113
('it', 'is')	109
('a', 'little')	107
('piece', 'of')	102
('and', 'a')	102
('of', 'butter')	94
('and', 'pepper')	87
('.', 'the')	85
('and', 'the')	84
('when', 'the')	82
('with', 'salt')	80


In [16]:
# Trigram counts over the lower-cased token stream
# (stop words and punctuation are still present).
trigram_stream = ngrams(all_tokens_lower, 3)
phrases = Counter(trigram_stream)
for entry in phrases.most_common(20):
    print("{}\t{}".format(*entry))

('on', 'the', 'fire')	90
('salt', 'and', 'pepper')	84
('piece', 'of', 'butter')	73
('a', 'piece', 'of')	63
('with', 'salt', 'and')	62
('.', 'when', 'the')	59
('a', 'pinch', 'of')	45
('in', 'a', 'saucepan')	45
('season', 'with', 'salt')	42
('the', 'fire', 'with')	41
('when', 'it', 'is')	39
('and', 'pepper', '.')	37
('through', 'a', 'sieve')	36
('complete', 'the', 'cooking')	34
('and', 'a', 'half')	33
('of', 'butter', ',')	27
('a', 'taste', 'of')	26
('it', 'on', 'the')	26
('and', 'when', 'it')	26
(',', 'salt', 'and')	25


In [17]:
# Bigram counts after stop-word and punctuation removal: tokens that were
# separated by a stop word in the text become adjacent here.
phrases = Counter()
phrases.update(ngrams(tokens_no_stop, 2))

for pair, count in phrases.most_common(20):
    print("{}\t{}".format(pair, count))

('salt', 'pepper')	106
('piece', 'butter')	73
('grated', 'cheese')	55
('bread', 'crumbs')	34
('tomato', 'sauce')	32
('put', 'fire')	32
('complete', 'cooking')	31
('thin', 'slices')	29
('brown', 'stock')	29
('season', 'salt')	29
('olive', 'oil')	26
('low', 'fire')	25
('chopped', 'fine')	25
('boiling', 'water')	22
('little', 'pieces')	22
('half', 'ounces')	21
('one', 'two')	18
('half', 'cooked')	18
('two', 'ounces')	18
('lemon', 'peel')	18


In [19]:
# Trigram counts over the normalised tokens
# (lower-cased, stop words removed, stemmed).
stem_trigrams = ngrams(tokens_normalized, 3)
phrases = Counter(stem_trigrams)

for triple, count in phrases.most_common(20):
    print("{}\t{}".format(triple, count))

('season', 'salt', 'pepper')	57
('bread', 'crumb', 'ground')	13
('cut', 'thin', 'slice')	13
('tast', 'lemon', 'peel')	12
('pinch', 'grate', 'chees')	11
('sprinkl', 'bread', 'crumb')	11
('good', 'oliv', 'oil')	10
('greas', 'butter', 'sprinkl')	10
('small', 'piec', 'butter')	10
('saucepan', 'piec', 'butter')	9
('piec', 'butter', 'brown')	9
('anoth', 'piec', 'butter')	9
('cut', 'littl', 'piec')	9
('crumb', 'ground', 'fine')	9
('cut', 'small', 'piec')	9
('half', 'inch', 'thick')	9
('medium', 'size', 'onion')	9
('ounc', 'sweet', 'almond')	9
('tomato', 'sauc', '12')	8
('littl', 'piec', 'butter')	8
