### What this script does:

- remove stopwords: English & French stopwords are loaded from two txt files into two sets. When a comment/post is detected as in French, detectable French stopwords will be removed. Otherwise, English stopwords will be removed. Note: stopwords list need to be updated, for example, if we need to find out about the attitude/sentiment of comments, we should probably exclude words such as 'couldn't', 'cannot', etc. from this list.


- remove non ascii characters: after this step, only digits, English & French characters are kept. Turn is off if it's unncecassry or removes too much implicit information. 


- tokenization: With MWETokenizer, multi-word tokens can be added based on our needs, for example, I added 'climate change', 'canada150', 'justin trudeau' as custermized tokens and count their frequency in later step. Note: there are duplicated tokens in each comment/post.


- stemming: stem the tokens obtain from last step.


- ouput tokens' & stemmers' frequency distribution: output csv files can be found in folder 'word_count', frequency of tokens and stemmers are listed in descending order. The freq_perc colomn is obtained by dividing the frequency of a word by the total number of comment/posts in each original csv file. Note: a word/token/stemmer can appear multiple times in a single comment/post, we can choose to count such a word to appear once or multiple times  per comment/post, according to our needs.  

In [1]:
import glob
import os
import pandas as pd
from langdetect import detect
import nltk
#nltk.download()   # comment after first download
from nltk.tokenize import MWETokenizer
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import FrenchStemmer
from nltk.stem.snowball import EnglishStemmer
from nltk.probability import FreqDist
#from nltk.stem.wordnet import WordNetLemmatizer

## Choose a path

In [2]:
#======= Twitter =======
rootPath='../tw/cleaned data/*.csv'
COLUMN_NAME = 'full_text_cleaned'

#======= Facebook =======
#rootPath='../fb/cleaned data/comments/*.csv'
#COLUMN_NAME = 'comment_message_cleaned'

#======= Instagram =======
#rootPath='../in/cleaned data/posts/*.csv'
#COLUMN_NAME = 'caption'


stopWordsPath_en = './stopwords_en.txt'
stopWordsPath_fr = './stopwords_fr.txt'
topicWordsPath = './topicwords.txt'

In [3]:
outputDir = rootPath[:-5] + 'word_count/'
if not os.path.exists(outputDir):
    os.makedirs(outputDir)
filePaths = glob.glob(rootPath)  
display(filePaths)

['../tw/cleaned data/ParksCanada_tweets.csv',
 '../tw/cleaned data/ParcsCanada_tweets.csv']

['ParksCanada_tweets.csv', 'ParcsCanada_tweets.csv']

In [4]:
testfile = '../tw/cleaned data/ParksCanada_tweets.csv'
filename = os.path.basename(testfile)
outputFileName = filename[:-4] + '_word_count.csv'
display(outputFileName)


'ParksCanada_tweets_word_count.csv'

In [5]:
def detect_lang(text):
    try:
        lang = detect(text)
    except:
        return 'error'
    return lang


def load_stop_words(filepath):
    with open(filepath) as file:
        lines = file.readlines()
        words = [token.strip() for token in lines]    # .encode('utf8')
        print('>>>Loading stopwords... ')
        #print(words)
        return words

    
def filter_stop_words(text):   ##  Why not working????????????
    stopWords = stopWords_en  # use global set variable 
    if detect_lang(text) == 'fr':
        stopWords = stopWords_fr
    filtered_text = []
    for word in text.split():
        if word.lower() not in stopWords: 
            #print(word.lower())
            filtered_text.append(word.strip())
    return ' '.join(filtered_text)


def filter_nonascii_chars(text):   #some non ascii chars are important! 
    keep_chars = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
                  'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 
                  'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 
                  'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 
                  'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 
                  'é', 'à', 'è', 'ù', 'â', 'ê', 'î', 'ô', 'û', 'ç', 'ë', 'ï', 'ü', 
                  ' ', '\''}
    filtered_text = []
    for word in text:
        if word in keep_chars:  
            filtered_text.append(word)
    return ''.join(filtered_text)


def load_topic_words(filepath):
    with open(filepath) as file:
        lines = file.readlines()
        words = [word.strip() for word in lines]
        return words


def tokenize_topic_words(topic_list):
    result = []
    print('>>>Adding custermized topic words/tokens to tokenizer...')
    for words in topic_list:
        print(words)
        result.append(words.split('_'))
    return result


def tokenize_text(text):
    return tokenizer.tokenize(text.lower().split()) 


def stem_text(text):
    print(text)
        # Set ignore_stopwords = False if want to customize stopwords
    if detect_lang(text) == 'fr':
        stemmer = FrenchStemmer(ignore_stopwords=True)
    else:
        stemmer = EnglishStemmer(ignore_stopwords=True) 
    stems = [stemmer.stem(tok) for tok in text]
    print(stems)
    return stems


"""
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(tok) for tok in text]
    print(lemmas)
    return lemmas
"""

'\ndef lemmatize_text(text):\n    lemmatizer = WordNetLemmatizer()\n    lemmas = [lemmatizer.lemmatize(tok) for tok in text]\n    print(lemmas)\n    return lemmas\n'

In [6]:
stopWords_en = set(load_stop_words(stopWordsPath_en))
stopWords_fr = set(load_stop_words(stopWordsPath_fr))

>>>Loading stopwords... 
>>>Loading stopwords... 


In [7]:
df = pd.read_csv(testfile)   # the first unnamed column already exists in csv file 
#df

In [8]:
s = df[COLUMN_NAME][8]
print(s)
stopWords = stopWords_en  # use global set variable 
if detect_lang(s) == 'fr':
    stopWords = stopWords_fr
print(stopWords)
for word in s.split():
    w = word.lower().strip()
    print(w)
    print(w in stopWords)

It’s , which means the year is coming to an end… ⏰ Our is to spend as much time in the great outdoors as possible in , how about you?
{'under', "aren't", "wasn't", 'then', 'more', 'all', "couldn't", 'the', 'which', "they've", "don't", 'i', 'their', 'to', 'them', 'you', 'further', 'down', 'each', "i'll", 'are', 'both', 'an', 'as', "i'm", "she'd", 'below', 'been', 'be', "shan't", "who's", 'had', "he'd", 'its', 'once', "doesn't", 'or', 'a', "hadn't", "weren't", 'at', "here's", 'other', 'him', 'too', 'we', 'those', 'he', 'these', "didn't", "they'd", 'yours', "you'd", 'hers', "it's", 'very', 'for', 'when', "what's", 'any', 'yourself', 'because', 'again', "wouldn't", 'having', 'they', 'has', 'her', 'am', 'ought', 'why', "you'll", 'where', 'on', "he's", 'himself', "shouldn't", 'did', 'so', 'while', "hasn't", 'only', "where's", "you're", 'myself', 'she', "you've", 'in', 'were', "let's", 'it', 'your', 'me', 'of', 'after', "she'll", "isn't", 'cannot', 'between', 'does', "that's", "i've", 'about'

In [9]:
df['text_filtered'] = df[COLUMN_NAME].astype(str)
df['text_filtered1'] = df['text_filtered'].apply(filter_stop_words)
df['text_filtered2'] = df['text_filtered1'].apply(filter_nonascii_chars)
df['text_filtered3'] = df['text_filtered2'].apply(filter_stop_words)
df[['text_filtered1', 'text_filtered2', 'text_filtered3']]

Unnamed: 0,text_filtered1,text_filtered2,text_filtered3
0,Immerse true winter wilderness inspirational s...,Immerse true winter wilderness inspirational s...,Immerse true winter wilderness inspirational s...
1,? Book early! reservations start soon!,Book early reservations start soon,Book early reservations start soon
2,"Put Canadian twist plans, cue stunning music v...",Put Canadian twist plans cue stunning music vi...,Put Canadian twist plans cue stunning music vi...
3,"know you’ve great year, tell us it! Share best...",know youve great year tell us it Share best ee...,know youve great year tell us Share best eerience
4,starry backcountry nights cozy rustic shelters...,starry backcountry nights cozy rustic shelters...,starry backcountry nights cozy rustic shelters...
5,"Looking getaway? Four hours Toronto, unwind am...",Looking getaway Four hours Toronto unwind amon...,Looking getaway Four hours Toronto unwind amon...
6,"Family/Group % December , ? Don’t miss opportu...",FamilyGroup December Dont miss opportunity ...,FamilyGroup December Dont miss opportunity sav...
7,"full want hear it! final , want show us best e...",full want hear it final want show us best eer...,full want hear final want show us best eerience
8,"It’s , means year coming end… ⏰ spend much tim...",Its means year coming end spend much time gr...,means year coming end spend much time great ou...
9,bring new gear? thing left book – reservations...,bring new gear thing left book reservations s...,bring new gear thing left book reservations st...


## Load multi-word tokens, tokenize comments/posts, and stem tokens

In [10]:
topicWords = load_topic_words(topicWordsPath)       # load custermized tokens
tokenizedTopicWords = tokenize_topic_words(topicWords)
tokenizer = MWETokenizer(tokenizedTopicWords)
#tokenizer = MWETokenizer()    # Uncomment this line if no customized multi-word tokens needed

df['text_tokenized'] = df['text_filtered2'].apply(tokenize_text)
df['text_stemmed'] = df['text_tokenized'].apply(stem_text)
df

>>>Adding custermized topic words/tokens to tokenizer...
climate_change
canada150
justin_trudeau
['immerse', 'true', 'winter', 'wilderness', 'inspirational', 'surroundings', 'yukon', 'canadas', 'largest', 'icefield', 'kluane', 'np', 'npr', '1']
['immers', 'true', 'winter', 'wilder', 'inspir', 'surround', 'yukon', 'canada', 'largest', 'icefield', 'kluan', 'np', 'npr', '1']
['book', 'early', 'reservations', 'start', 'soon']
['book', 'earli', 'reserv', 'start', 'soon']
['put', 'canadian', 'twist', 'plans', 'cue', 'stunning', 'music', 'video', 'performed', 'breathtaking', 'scenery', 'locations', '8']
['put', 'canadian', 'twist', 'plan', 'cue', 'stun', 'music', 'video', 'perform', 'breathtak', 'sceneri', 'locat', '8']
['know', 'youve', 'great', 'year', 'tell', 'us', 'it', 'share', 'best', 'eerience']
['know', 'youv', 'great', 'year', 'tell', 'us', 'it', 'share', 'best', 'eerienc']
['starry', 'backcountry', 'nights', 'cozy', 'rustic', 'shelters', 'weve', 'got', 'adventures', 'help', 'get', '

['there', 'snow', 'much', 'fun', 'park', 'site', 'winter', 'month', 'check', 'idea']
['happy', 'think', 'mountain', 'views', 'best', 'around', 'you', 'share', 'photos', '2']
['happi', 'think', 'mountain', 'view', 'best', 'around', 'you', 'share', 'photo', '2']
['looking', 'family', 'fun', 'winter', 'weve', 'rounded', 'ten', 'places', 'offering', 'super', 'fun', 'snowfilled', 'adventures', '0']
['look', 'famili', 'fun', 'winter', 'weve', 'round', 'ten', 'place', 'offer', 'super', 'fun', 'snowfil', 'adventur', '0']
['channel', 'check', 'playlist', 'see', 'were', 'protect', 'places']
['channel', 'check', 'playlist', 'see', 'were', 'protect', 'place']
['youre', 'never', 'youngor', 'oldto', 'enjoy', 'giddy', 'fastpaced', 'thrills', 'ride', 'hop', 'toboggan']
['your', 'never', 'youngor', 'oldto', 'enjoy', 'giddi', 'fastpac', 'thrill', 'ride', 'hop', 'toboggan']
['its', 'almost', 'last', 'day', 'order', 'arrive', 'grab', 'last', 'minute', 'gifts', 'c']
['its', 'almost', 'last', 'day', 'order'

['sunday', 'park', 'without', 'furri', 'companion', 'dont', 'forget', 'week', 'show', 'us']
['knew', 'coast', 'salish', 'peoples', 'clam', 'gardens', 'pantries', 'classrooms', 'learn', 'more']
['knew', 'coast', 'salish', 'peopl', 'clam', 'garden', 'pantri', 'classroom', 'learn', 'more']
['instagram', 'follow', 'us', 'see', 'amazing', 'places', 'forget', 'tag', 'chance', 'featured', 'u']
['instagram', 'follow', 'us', 'see', 'amaz', 'place', 'forget', 'tag', 'chanc', 'featur', 'u']
['bc', 'became', 'british', 'colony', 'ceremony', 'took', 'place', 'fort', 'langley', 'nhs', 'gun', 'salute', 'u']
['bc', 'becam', 'british', 'coloni', 'ceremoni', 'took', 'place', 'fort', 'langley', 'nhs', 'gun', 'salut', 'u']
['casting', 'now', 'open', 'season', 'might', 'even', 'get', 'see', 'places', 'apply', 'today']
['cast', 'now', 'open', 'season', 'might', 'even', 'get', 'see', 'place', 'appli', 'today']
['great', 'see', 'amazing', 'players', 'teams', 'plaque', 'unveiling', 'yvan', 'cournoyer', 'dave',

['week', 'handl', 'panhandl', 'cndus', 'border', 'disput', 'c']
['week', 'handling', 'panhandle', 'cdnus', 'border', 'dispute', 'c', 'c']
['week', 'handl', 'panhandl', 'cdnus', 'border', 'disput', 'c', 'c']
['proposed', 'marine', 'area', 'tallurutiup', 'imanga', 'home', 'worlds', '8']
['propos', 'marin', 'area', 'tallurutiup', 'imanga', 'home', 'world', '8']
['snapping', 'turtle', 'one', 'speciesatrisk', 'park', 'new', 'ecopassages', 'will', 'offer', 'critter', 'safe', 'way', 'to']
['snap', 'turtl', 'one', 'speciesatrisk', 'park', 'new', 'ecopassag', 'will', 'offer', 'critter', 'safe', 'way', 'to']
['its', 'canada', 'want', 'hear', 'you', 'share', 'reduce', 'waste', 'visiting', 'sites', 'amp', 'parks']
['its', 'canada', 'want', 'hear', 'you', 'share', 'reduc', 'wast', 'visit', 'site', 'amp', 'park']
['prayer', 'book', 'given', 'isadore', 'dumas', 'fought', 'alongside', 'louis', 'riel', 'northwest', 'resistance']
['prayer', 'book', 'given', 'isador', 'duma', 'fought', 'alongsid', 'loui'

['youve', 'seen', 's', 'tides', 'day', 'seen', 'dark', 'sky', 'night', 'check', 'september']
['youv', 'seen', 's', 'tide', 'day', 'seen', 'dark', 'sky', 'night', 'check', 'septemb']
['looking', 'learn', 'war', 'historic', 'site', 'that', 'check', 'queenston', 'heights', 'nhs']
['look', 'learn', 'war', 'histor', 'site', 'that', 'check', 'queenston', 'height', 'nhs']
['check', 'latest', 'update']
['check', 'latest', 'updat']
['fast', 'approaching', 'take', 'time', 'plan', 'dream', 'adventure', 'u']
['fast', 'approach', 'take', 'time', 'plan', 'dream', 'adventur', 'u']
['kenow', 'fire', 'information', 'update', 'unified', 'command', 'september', '15']
['kenow', 'fire', 'inform', 'updat', 'unifi', 'command', 'septemb', '15']
['escape', 'hustle', 'bustle', 'plan', 'weekend', 'getaway']
['escap', 'hustl', 'bustl', 'plan', 'weekend', 'getaway']
['extensive', 'fire', 'protection', 'measures', 'taken', 'critical', 'infrastructure', 'park', 'resources', 'community']
['extens', 'fire', 'protect',

['aboard', 'watch', 'tall', 'ship', 'sail', 'water', 'weekend']
['please', 'note', 'parks', 'canadas', 'website', 'will', 'unavailable', '00', 'am', '00', 'pm', 'est', 'sunday', 'august', 'th']
['pleas', 'note', 'park', 'canada', 'websit', 'will', 'unavail', '00', 'am', '00', 'pm', 'est', 'sunday', 'august', 'th']
['tallurutiup', 'imanga', 'key', 'feeding', 'area', 'hundreds', 'thousands', 'sea', 'birds']
['tallurutiup', 'imanga', 'key', 'feed', 'area', 'hundr', 'thousand', 'sea', 'bird']
['recordbreaking', 'whooping', 'crane', 'chicks', 'counted', 'making', 'history']
['recordbreak', 'whoop', 'crane', 'chick', 'count', 'make', 'histori']
['gateway', 'great', 'prizes', 'enter', 'now', 'amp', 'win', 'epic', 'trip', '8']
['gateway', 'great', 'prize', 'enter', 'now', 'amp', 'win', 'epic', 'trip', '8']
['snapped', 'couple', 'autumns', 'ago', 'still', 'one', 'favourite', 'photos', 'ht']
['snap', 'coupl', 'autumn', 'ago', 'still', 'one', 'favourit', 'photo', 'ht']
['nights', 'skies', 'light'

Unnamed: 0.1,Unnamed: 0,id,retweet_count,favorite_count,created_at,full_text_cleaned,hashtags,urls,text_filtered,text_filtered1,text_filtered2,text_filtered3,text_tokenized,text_stemmed
0,0,947542990825848832,9,37,2017-12-31 19:00:28,Immerse yourself in a true winter wilderness i...,"['#WinterCamping', '#ExploreYukon']","['https://t.co/82ikpjBdX', 'https://t.co/xNSQv...",Immerse yourself in a true winter wilderness i...,Immerse true winter wilderness inspirational s...,Immerse true winter wilderness inspirational s...,Immerse true winter wilderness inspirational s...,"[immerse, true, winter, wilderness, inspiratio...","[immers, true, winter, wilder, inspir, surroun..."
1,1,947512987694321664,6,21,2017-12-31 17:01:14,? Book early! Our reservations start soon!,"['#NewYearResolution', '#2018camping', '#PlanA...","['https://t.co/ePsOQZnx5q', 'https://t.co/v3o7...",? Book early! Our reservations start soon!,? Book early! reservations start soon!,Book early reservations start soon,Book early reservations start soon,"[book, early, reservations, start, soon]","[book, earli, reserv, start, soon]"
2,2,947225931952939009,14,44,2017-12-30 22:00:35,"Put a Canadian twist on your plans, and cue up...","['#NewYearsEve', '#AuldLangSyne', '#CapeSpearN...",['https://t.co/1mhZwqP9A'],"Put a Canadian twist on your plans, and cue up...","Put Canadian twist plans, cue stunning music v...",Put Canadian twist plans cue stunning music vi...,Put Canadian twist plans cue stunning music vi...,"[put, canadian, twist, plans, cue, stunning, m...","[put, canadian, twist, plan, cue, stun, music,..."
3,3,947150587883347969,7,33,2017-12-30 17:01:12,We know you’ve had some great in our this year...,"['#adventures', '#places', '#ParksCanada', '#C...",['https://t.co/pPqT7pWZAq'],We know you’ve had some great in our this year...,"know you’ve great year, tell us it! Share best...",know youve great year tell us it Share best ee...,know youve great year tell us Share best eerience,"[know, youve, great, year, tell, us, it, share...","[know, youv, great, year, tell, us, it, share,..."
4,4,947120242572054533,7,29,2017-12-30 15:00:37,From starry backcountry nights to cozy rustic ...,"['#WinterCamping', '#campvibes']","['https://t.co/82ikpjBdX', 'https://t.co/OCf86...",From starry backcountry nights to cozy rustic ...,starry backcountry nights cozy rustic shelters...,starry backcountry nights cozy rustic shelters...,starry backcountry nights cozy rustic shelters...,"[starry, backcountry, nights, cozy, rustic, sh...","[starri, backcountri, night, cozi, rustic, she..."
5,5,946848436569149442,7,28,2017-12-29 21:00:33,"Looking for getaway? Four hours from Toronto, ...","['#WinterCamping', '#YYZ']","['https://t.co/82ikpjBdX', 'https://t.co/RosQB...","Looking for getaway? Four hours from Toronto, ...","Looking getaway? Four hours Toronto, unwind am...",Looking getaway Four hours Toronto unwind amon...,Looking getaway Four hours Toronto unwind amon...,"[looking, getaway, four, hours, toronto, unwin...","[look, getaway, four, hour, toronto, unwind, a..."
6,6,946818267636695043,6,9,2017-12-29 19:00:40,"our Family/Group is % off until December , ? D...","['#DidYouKnow', '#DiscoveryPass']","['https://t.co/U8DWrr6', 'https://t.co/WKkwA9X...","our Family/Group is % off until December , ? D...","Family/Group % December , ? Don’t miss opportu...",FamilyGroup December Dont miss opportunity ...,FamilyGroup December Dont miss opportunity sav...,"[familygroup, december, dont, miss, opportunit...","[familygroup, decemb, dont, miss, opportun, sa..."
7,7,946788104144281605,11,72,2017-12-29 17:00:49,was full of and and we want to hear about it! ...,"['#Canada150', '#adventure', '#discovery', '#P...",['https://t.co/2mC3oZDW2n'],was full of and and we want to hear about it! ...,"full want hear it! final , want show us best e...",full want hear it final want show us best eer...,full want hear final want show us best eerience,"[full, want, hear, it, final, want, show, us, ...","[full, want, hear, it, final, want, show, us, ..."
8,8,946764037966180354,16,34,2017-12-29 15:25:11,"It’s , which means the year is coming to an en...","['#TickTockDay', '#NewYearsResolution', '#GetO...",['https://t.co/bQCROMJCuN'],"It’s , which means the year is coming to an en...","It’s , means year coming end… ⏰ spend much tim...",Its means year coming end spend much time gr...,means year coming end spend much time great ou...,"[its, means, year, coming, end, spend, much, t...","[its, mean, year, come, end, spend, much, time..."
9,9,946512381756264449,10,39,2017-12-28 22:45:11,Did bring you new gear? The only thing left is...,"['#Santa', '#camping', '#2018camping']","['https://t.co/ePsOQZnx5q', 'https://t.co/nNb9...",Did bring you new gear? The only thing left is...,bring new gear? thing left book – reservations...,bring new gear thing left book reservations s...,bring new gear thing left book reservations st...,"[bring, new, gear, thing, left, book, reservat...","[bring, new, gear, thing, left, book, reserv, ..."


In [11]:
token_lst = df['text_tokenized'].tolist()
token_lst
token_fdist = FreqDist()
for list_i in token_lst:
    list_i = set(list_i)  # Adding this line would count a word once even if it appears multple times in one comment/post
    for token in list_i:
        token_fdist[token.lower()] += 1
token_fdist.most_common(30)
#token_fdist['justin_trudeau']       # check the frequency of a token

[('us', 76),
 ('its', 70),
 ('see', 67),
 ('places', 59),
 ('check', 58),
 ('share', 56),
 ('get', 55),
 ('park', 54),
 ('national', 53),
 ('time', 46),
 ('nhs', 46),
 ('amp', 43),
 ('one', 42),
 ('can', 40),
 ('will', 39),
 ('new', 38),
 ('s', 38),
 ('c', 37),
 ('parks', 37),
 ('sites', 37),
 ('learn', 37),
 ('week', 36),
 ('show', 34),
 ('favourite', 34),
 ('want', 33),
 ('fun', 33),
 ('d', 33),
 ('take', 33),
 ('today', 32),
 ('best', 31)]

In [12]:
stemmer_lst = df['text_stemmed'].tolist()
stemmer_lst
stemmer_fdist = FreqDist()
for list_i in stemmer_lst:
    list_i = set(list_i)  # Adding this line would count a word once even if it appears multple times in one comment/post
    for token in list_i:
        stemmer_fdist[token.lower()] += 1
stemmer_fdist.most_common(30)
#stemmer_fdist['justin_trudeau']       # stemmer doesn't change multi-word tokens

[('park', 88),
 ('place', 79),
 ('us', 76),
 ('its', 70),
 ('see', 68),
 ('share', 65),
 ('week', 63),
 ('site', 61),
 ('check', 60),
 ('get', 59),
 ('nation', 56),
 ('canada', 51),
 ('adventur', 47),
 ('time', 47),
 ('photo', 47),
 ('nhs', 46),
 ('plan', 44),
 ('year', 44),
 ('one', 43),
 ('amp', 43),
 ('learn', 41),
 ('can', 40),
 ('will', 39),
 ('new', 38),
 ('s', 38),
 ('c', 37),
 ('take', 37),
 ('favourit', 36),
 ('make', 35),
 ('start', 34)]

## Output token/stemmer frequency distribution

In [13]:
token_df = pd.DataFrame(list(token_fdist.items()), columns=['token', 'tok_freq'])
token_df['tok_freq_perc'] = token_df.tok_freq/len(df)
token_df = token_df.sort_values('tok_freq', ascending=False).reset_index(drop=True)

stemmer_df = pd.DataFrame(list(stemmer_fdist.items()), columns=['stemmer', 'stem_freq'])
stemmer_df['stem_freq_perc'] = stemmer_df.stem_freq/len(df)
stemmer_df = stemmer_df.sort_values('stem_freq', ascending=False).reset_index(drop=True)

print('>>> Output word frequency distribution for ' + filename)
output_df = pd.concat([token_df, stemmer_df], axis=1)
output_df.to_csv(outputDir + outputFileName, index=None)      

Output Stemmer frequency 
