### What this script does:

- Remove stopwords: English & French stopwords are loaded from two txt files into two sets. When a comment/post is detected as in French, detectable French stopwords will be removed. Otherwise, English stopwords will be removed accordingly. Note: the stopwords list needs to be updated to suit our purpose, for example, when we need to find out about the attitude/sentiment of comments, we should probably exclude words such as 'couldn't', 'cannot' or 'mustn't' from this list. 


- Remove non ascii characters: after this step, only digits, English & French characters are kept. Turn it off if it's unnecessary or removes too much implicit information such as emoji's.


- Tokenization: with MWETokenizer, multi-word tokens can be added based on our needs, for example, I've added 'climate change', 'canada150', 'justin trudeau' as customized tokens and counted their frequency in later step. 


- Stemming: stem the tokens obtained from last step.


- Output tokens' & stemmers' frequency distribution: output csv files can be found in folder 'word_count', frequency of tokens and stemmers are listed in descending order. The freq_perc column is obtained by dividing the frequency of a word by the total number of comment/posts in each original csv file. Note: Note: there are duplicated tokens in each comment/post, we can choose to count the duplicate or not by commenting/uncommenting a line of code.


In [1]:
import glob
import os
import pandas as pd
from langdetect import detect
import nltk
#nltk.download()   # comment after first download
from nltk.tokenize import sent_tokenize, MWETokenizer, wordpunct_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import FrenchStemmer
from nltk.stem.snowball import EnglishStemmer
from nltk.probability import FreqDist

## Choose a path

In [2]:
#======= Twitter =======
rootPath='../tw/cleaned data/*.csv'
COLUMN_NAME = 'full_text_cleaned'

#======= Facebook =======
#rootPath='../fb/cleaned data/comments/*.csv'
#COLUMN_NAME = 'comment_message_cleaned'

#======= Instagram =======
#rootPath='../in/cleaned data/posts/*.csv'
#COLUMN_NAME = 'caption'


filePaths = glob.glob(rootPath)  
display(filePaths)
multiWordsPath = './multiwords.txt'

['../tw/cleaned data/ParksCanada_tweets.csv',
 '../tw/cleaned data/ParcsCanada_tweets.csv']

In [3]:
testfile = '../tw/cleaned data/ParksCanada_tweets.csv'
filename = os.path.basename(testfile)

In [4]:
wordCountOutputDir = os.path.dirname(rootPath).replace('cleaned data', 'filtered_data') + '/word_count/'
if not os.path.exists(wordCountOutputDir):
    os.makedirs(wordCountOutputDir)
cleanedDataFileName = testfile.replace('cleaned data', 'filtered_data')
wordCountFileName = wordCountOutputDir + os.path.splitext(filename)[0] + '_word_count.csv'
display(wordCountOutputDir)
display(cleanedDataFileName)
display(wordCountFileName)

'../tw/filtered_data/word_count/'

'../tw/filtered_data/ParksCanada_tweets.csv'

'../tw/filtered_data/word_count/ParksCanada_tweets_word_count.csv'

In [5]:
stopWords_en = set(stopwords.words('english'))
stopWords_fr = set(stopwords.words('french'))

In [6]:
def detect_lang(text):
    try:
        lang = detect(text)
    except:
        return 'error'
    return lang


def filter_stop_words(text):   
    stopWords = stopWords_en  
    if detect_lang(text) == 'fr':
        stopWords = stopWords_fr  
    filtered_text = [w for w in wordpunct_tokenize(text) if w.lower() not in stopWords
            and len(w) > 1 and w.isalnum()]   
    return ' '.join(filtered_text)


def load_multi_words(filepath):
    with open(filepath) as file:
        lines = file.readlines()
        words = [word.strip() for word in lines]
        return words


def tokenize_multi_words(topic_list):
    result = []
    print('>>> Adding custermized topic words/tokens to tokenizer...')
    for words in topic_list:
        print(words)
        result.append(words.split('_'))
    return result

multiWords = load_multi_words(multiWordsPath)       # load custermized multi-word tokens
tokenizedMultiWords = tokenize_multi_words(multiWords)
tokenizer = MWETokenizer(tokenizedMultiWords)
#tokenizer = MWETokenizer()    # Uncomment this line if no customized multi-word tokens needed

def tokenize_text(text):
    return tokenizer.tokenize(text.split())   # remove .lower()


def stem_text(text):
    if detect_lang(text) == 'fr':
        stemmer = FrenchStemmer(ignore_stopwords=False)
    else:
        stemmer = EnglishStemmer(ignore_stopwords=False) 
    stems = [stemmer.stem(tok) for tok in text]
    return stems


>>> Adding custermized topic words/tokens to tokenizer...
climate_change
canada150
justin_trudeau


## Load csv files, filter stopwords

In [7]:
df = pd.read_csv(testfile)   # the first unnamed column already exists in csv file 

In [8]:
df['text_filtered'] = df[COLUMN_NAME].astype(str).apply(filter_stop_words)
#pd.options.display.max_rows = 999

## Load multi-word tokens, tokenize comments/posts, and stem tokens

In [9]:
df['text_tokenized'] = df['text_filtered'].apply(tokenize_text)
df['text_stemmed'] = df['text_tokenized'].apply(stem_text)
df
print('>>> Output cleaned data for ' + filename)
df.to_csv(cleanedDataFileName, index=None) 

>>> Output cleaned data for ParksCanada_tweets.csv


In [10]:
token_lst = df['text_tokenized'].tolist()
token_lst
token_fdist = FreqDist()
for list_i in token_lst:
    list_i = set(list_i)  # Adding this line would count a word once even if it appears multple times in one comment/post
    for token in list_i:
        token_fdist[token.lower()] += 1
#token_fdist.most_common(30)
#token_fdist['justin_trudeau']       # check the frequency of a token

In [11]:
stemmer_lst = df['text_stemmed'].tolist()
stemmer_lst
stemmer_fdist = FreqDist()
for list_i in stemmer_lst:
    list_i = set(list_i)  # Adding this line would count a word once even if it appears multple times in one comment/post
    for token in list_i:
        stemmer_fdist[token.lower()] += 1
#stemmer_fdist.most_common(30)
display(stemmer_fdist['canada150']) # stemmer doesn't change multi-word tokens
display(stemmer_fdist['justin_trudeau'])        

0

0

## Output token/stemmer frequency distribution

In [12]:
token_df = pd.DataFrame(list(token_fdist.items()), columns=['token', 'tok_freq'])
token_df['tok_freq_perc'] = token_df.tok_freq/len(df)
token_df = token_df.sort_values('tok_freq', ascending=False).reset_index(drop=True)

stemmer_df = pd.DataFrame(list(stemmer_fdist.items()), columns=['stemmer', 'stem_freq'])
stemmer_df['stem_freq_perc'] = stemmer_df.stem_freq/len(df)
stemmer_df = stemmer_df.sort_values('stem_freq', ascending=False).reset_index(drop=True)

print('>>> Output word frequency distribution for ' + filename)
output_df = pd.concat([token_df, stemmer_df], axis=1)
output_df.to_csv(wordCountFileName, index=None)   
output_df

>>> Output word frequency distribution for ParksCanada_tweets.csv


Unnamed: 0,token,tok_freq,tok_freq_perc,stemmer,stem_freq,stem_freq_perc
0,us,76,0.097812,park,88.0,0.113256
1,see,67,0.086229,place,79.0,0.101673
2,week,61,0.078507,us,76.0,0.097812
3,places,59,0.075933,see,68.0,0.087516
4,check,58,0.074646,share,65.0,0.083655
5,share,56,0.072072,week,63.0,0.081081
6,get,55,0.070785,site,61.0,0.078507
7,park,54,0.069498,check,60.0,0.077220
8,national,53,0.068211,get,59.0,0.075933
9,canada,51,0.065637,nation,56.0,0.072072
