In [1]:
import pandas as pd
import numpy as np
import nltk
import os
import nltk.corpus

# Load text file

In [2]:
# Path of the raw text file that the rest of the notebook analyses.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
filename = '/Users/david.vesely/tmp/testH.text'

In [3]:
# Read the whole input file into memory as a list of lines
# (readlines() keeps the trailing newline of each line).
# The encoding is given explicitly because the text contains non-ASCII
# characters; without it open() falls back to the platform's locale encoding,
# which makes the result machine-dependent.
# NOTE(review): assumes the file is UTF-8 encoded — confirm for other inputs.
with open(filename, 'r', encoding='utf-8') as f:
    lines = f.readlines()

# Tokenization

In [15]:
from nltk.tokenize import word_tokenize

# Tokenize each line separately and flatten the per-line results into a
# single list that preserves the original word order.
tokens = [token for line in lines for token in word_tokenize(line)]

In [16]:
# (total number of tokens, number of distinct tokens)
len(tokens), len(set(tokens))

(162246, 16038)

# Stemming

In [17]:
from nltk.stem import PorterStemmer

# Apply the Porter stemmer to every token; `stemmed` stays aligned
# position-by-position with `tokens`.
pst = PorterStemmer()
stemmed = list(map(pst.stem, tokens))

In [18]:
# (total number of stems, number of distinct stems)
len(stemmed), len(set(stemmed))

(162246, 10143)

# Lemmatization

In [20]:
from nltk.stem import WordNetLemmatizer

# Lemmatize every token. Only the word itself is passed, so the lemmatizer's
# default part-of-speech setting is used for all tokens.
lemmatizer = WordNetLemmatizer()
lemmatized = list(map(lemmatizer.lemmatize, tokens))

In [22]:
# (total number of lemmas, number of distinct lemmas)
len(lemmatized), len(set(lemmatized))

(162246, 14492)

# Collect the processing stages in a pandas DataFrame

In [26]:
# One row per token, with its stemmed and lemmatized form side by side.
stage_columns = ['token', 'stemmed', 'lemmatized']
stage_rows = zip(tokens, stemmed, lemmatized)
df = pd.DataFrame(stage_rows, columns=stage_columns)

In [28]:
# Preview the first rows of the token / stemmed / lemmatized table.
df.head()

Unnamed: 0,token,stemmed,lemmatized
0,[,[,[
1,],],]
2,[,[,[
3,],],]
4,English,english,English


# Remove punctuation

In [66]:
# Inspect the last character of the value in row 4590, column 0 (the 'token' column).
# NOTE(review): 4590 is a hand-picked row for this particular input file.
df.iloc[4590, 0][-1]

'¹'

In [59]:
# Show rows whose lemma contains at least one ASCII digit and is longer than
# two characters.
has_digit = df['lemmatized'].str.contains('[0-9]')
longer_than_two = df['lemmatized'].apply(len) > 2
df[has_digit & longer_than_two]

Unnamed: 0,token,stemmed,lemmatized
8,2014,2014,2014
16,2014,2014,2014
33,2011,2011,2011
172,978-0-7710-3850-1,978-0-7710-3850-1,978-0-7710-3850-1
177,978-0-7710-3852-5,978-0-7710-3852-5,978-0-7710-3852-5
...,...,...,...
162101,an23378504,an23378504,an23378504
162114,11267,11267,11267
162133,249,249,249
162158,1916,1916,1916


# Removing stopwords

In [32]:
from nltk import word_tokenize
from nltk.corpus import stopwords as stopwords_corpus

# The corpus reader is imported under an alias so that binding the `stopwords`
# set below does not overwrite it. Previously the set replaced the reader and
# re-running this cell failed with "'set' object has no attribute 'words'".
stopwords = set(stopwords_corpus.words('english'))

# Keep rows whose lemma is not an English stopword (compared case-insensitively)
# and is longer than two characters.
lemma = df['lemmatized']
is_stopword = lemma.str.lower().isin(stopwords)
is_long_enough = lemma.str.len() > 2
removed_stopwords = df[~is_stopword & is_long_enough]

removed_stopwords.head()

Unnamed: 0,token,stemmed,lemmatized
4,English,english,English
5,translation,translat,translation
6,copyright,copyright,copyright
8,2014,2014,2014
10,Yuval,yuval,Yuval


In [35]:
# Display the filtered rows ordered by token; the sorted frame is only shown,
# not assigned, so `removed_stopwords` itself keeps its original order.
removed_stopwords.sort_values('token')

Unnamed: 0,token,stemmed,lemmatized
159454,.pdf,.pdf,.pdf
176,.–ISBN,.–isbn,.–ISBN
158423,//files.shareholder.com/downloads/RBS/626570033,//files.shareholder.com/downloads/rbs/626570033,//files.shareholder.com/downloads/RBS/626570033
161358,//news.bbc.c0.uk/2/hi/8164060.stm,//news.bbc.c0.uk/2/hi/8164060.stm,//news.bbc.c0.uk/2/hi/8164060.stm
161462,//news.bbc.co.Uk/2/hi/health/7954968.stm,//news.bbc.co.uk/2/hi/health/7954968.stm,//news.bbc.co.Uk/2/hi/health/7954968.stm
...,...,...,...
93367,£25,£25,£25
93422,"£58,347","£58,347","£58,347"
93402,"£58,348","£58,348","£58,348"
37757,Çatalhöyük,çatalhöyük,Çatalhöyük


# Normalize

In [10]:
# Lowercase the lemmas that survived stopword removal.
# `removed_stopwords` is a DataFrame, and iterating over a DataFrame yields its
# column labels rather than its rows, so the 'lemmatized' column is selected
# explicitly before normalizing.
normalized = removed_stopwords['lemmatized'].str.lower().tolist()

# Frequency of tokens

In [11]:
from nltk.probability import FreqDist
# Count how often each normalized word occurs.
frequencies = FreqDist(normalized)
frequencies

FreqDist({'human': 605, 'people': 511, 'world': 403, 'one': 387, 'year': 378, 'even': 373, 'could': 339, 'would': 338, 'new': 327, 'empire': 302, ...})

In [12]:
# The ten most frequent words with their counts, most frequent first.
frequencies.most_common(10)

[('human', 605),
 ('people', 511),
 ('world', 403),
 ('one', 387),
 ('year', 378),
 ('even', 373),
 ('could', 339),
 ('would', 338),
 ('new', 327),
 ('empire', 302)]

# Load Dictionary

In [13]:
# pandas is already imported in the first cell; the import is repeated here.
import pandas as pd

# Load the dictionary CSV into a DataFrame using read_csv's default settings.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
dictionary_filename = '/Users/david.vesely/tmp/dictionary.csv'
dictionary = pd.read_csv(dictionary_filename)

In [14]:
# Rename the columns in place; this raises if the CSV does not have exactly
# three columns.
# NOTE(review): presumably the columns are word, part of speech, definition — verify against the file.
dictionary.columns = ['word', 'type', 'explanation']

In [15]:
# Normalize the dictionary headwords: strip literal double quotes and lowercase.
# The vectorized .str accessor is used instead of apply(lambda ...) because it
# passes missing values through as NaN; read_csv turns entries such as "null"
# or "NA" into NaN by default, and calling .replace on those floats would raise.
dictionary['word'] = (
    dictionary['word']
    .str.replace('"', '', regex=False)
    .str.lower()
)

In [16]:
# Collapse the dictionary to one row per word, taking the first non-null value
# of each remaining column within a word group, then restore 'word' as a
# regular column and order the rows alphabetically.
dictionary = (
    dictionary
    .groupby('word')
    .first()
    .reset_index()
    .sort_values('word')
)

# Merge frequencies with dictionary

In [17]:
# Turn the (word, count) pairs of the frequency distribution into a two-column DataFrame.
frequencies_df = pd.DataFrame(frequencies.items(), columns=['word', 'frequency'])

In [18]:
# Inner join on 'word': only words present in both the text frequencies and
# the dictionary are kept.
merged = frequencies_df.merge(dictionary, on='word', how='inner')

In [19]:
# Display the merged table ordered by ascending frequency; the result is only
# shown, not assigned, so `merged` itself is left in merge order.
merged.sort_values('frequency')

Unnamed: 0,word,frequency,type,explanation
1261,validity,1,n.,The quality or state of being valid; strength;...
1686,paean,1,n.,An ancient Greek hymn in honor of Apollo as a ...
1680,macedonian,1,a.,"Belonging, or relating, to Macedonia."
1678,unquestioned,1,a.,Not called in question; not doubted.
1672,heartless,1,a.,Without a heart.
...,...,...,...,...
2,first,225,a.,Preceding all others of a series or kind; the ...
16,history,266,n.,A learning or knowing by inquiry; the knowledg...
189,new,327,superl.,"Having existed, or having been made, but a sho..."
64,year,378,n.,The time of the apparent revolution of the sun...


In [21]:
# Write the merged table to CSV. to_csv is called with its defaults, so the
# DataFrame index is written as an unnamed first column.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
result_filename = '/Users/david.vesely/tmp/result_01.csv'
merged.to_csv(result_filename)