In [1]:
import glob
import os
import pandas as pd
import nltk
#nltk.download()   # comment after first download
from nltk.tokenize import MWETokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import gensim
from gensim import corpora
from gensim.models import Word2Vec
import string
from numbers import Number
from pprint import pprint
import logging
import operator
from pprint import pprint
pd.options.display.max_rows = 30

In [2]:
# --- notebook configuration -------------------------------------------------
# Glob pattern selecting the per-account CSV exports to analyse.
data_folder = './Accounts/*.csv'

# Columns copied from every input CSV into the merged dataframe.
OUTPUT_COLS = ['caption_original']

# CSV file holding the hand-picked action keywords.
keywords_chosen = 'Que5_Que6 KeywordLists.csv'


In [3]:
# Create the output directory next to the input CSV files.
# The trailing slash is kept so that file names can be appended directly.
outputDir = os.path.dirname(data_folder) + '/q5q6_output/'
# exist_ok=True makes the cell safe to re-run and avoids the race between a
# separate "does it exist?" check and the directory creation.
os.makedirs(outputDir, exist_ok=True)

In [4]:
# Characters stripped from post text: starts out as every character in
# string.punctuation (a later cell removes the ones used by the keywords).
exclude = set()
exclude.update(string.punctuation)

In [5]:
# NLTK's English stop-word list, extended with three extra entries.
stopWords = set(stopwords.words('english'))
stopWords.update(['theyre', "we're", "we've"])

# NLTK English lemmatizer shared by the keyword and text lemmatization helpers.
lemma = WordNetLemmatizer()

# detect_lang can be used to check the percentage of non-English posts.
# Note that a missing value (NaN) can be detected as many different languages,
# such as English, Spanish or Italian.
def detect_lang(text):
    """Return the language reported by ``detect`` for ``text``, or ``'error'``.

    Any ordinary exception raised by ``detect`` is mapped to the string
    ``'error'``. KeyboardInterrupt and SystemExit are allowed to propagate so
    a long ``.apply(detect_lang)`` run can still be interrupted.

    NOTE(review): ``detect`` is not imported in any cell visible here --
    presumably ``langdetect.detect``; confirm and add the import. Until it is
    in scope, the resulting NameError is caught below and every call returns
    ``'error'``.
    """
    try:
        lang = detect(text)
    except Exception:
        return 'error'
    return lang

# lemmatize_keywords turns one cell of the keyword file into a single
# underscore-joined, lower-cased token; 'nan' cells become ''.
# The lemmatizer works word by word, so some words are left as they are,
# e.g. "local eating" does not become "local eat".
def lemmatize_keywords(col):
    """Normalise one keyword phrase from the keyword list file.

    Parameters
    ----------
    col : str
        Raw cell value (already cast to ``str``).

    Returns
    -------
    str
        ``''`` when the cell is ``'nan'`` (case-insensitive); otherwise the
        words of the phrase lemmatized, lower-cased and joined with ``'_'``.

    Side effects
    ------------
    Every whitespace-separated word of ``col`` that is currently in the global
    ``stopWords`` set is printed and removed from that set, so a word used in a
    keyword is no longer treated as a stop word.
    """
    if col.lower() == 'nan':
        return ''

    # A stop word that appears in a keyword must survive stop-word removal later on.
    for token in col.split():
        if token not in stopWords:
            continue
        print(token)
        stopWords.remove(token)

    # Use a plain apostrophe and drop full stops before splitting into words.
    normalized = col.replace('’', '\'').replace('.', '')
    parts = []
    for word in normalized.split():
        parts.append(lemma.lemmatize(word).lower())
    return '_'.join(parts)


In [6]:
#print(stopWords)

In [7]:
# Load the keyword list and derive the lemmatized keyword set from it.
pd.options.display.max_rows = 100
keywords_df = pd.read_csv(keywords_chosen, encoding='utf-8')   # "ISO-8859-1"

raw_actions = keywords_df['Actions to advance sustainablity'].astype(str)
keywords_df['lemmatized_keywords'] = raw_actions.apply(lemmatize_keywords)
keywords_list = set(keywords_df['lemmatized_keywords'])
display(keywords_df)

# Punctuation that occurs inside a keyword must survive the punctuation-removal
# step, so those characters are taken out of the `exclude` set (in place).
exclude.difference_update(char for word in keywords_list for char in word)

# Register every keyword, split into its words, with the multi-word tokenizer
# so that a matching word sequence in a post is merged into one token.
multi_word = [keyword.split('_') for keyword in keywords_list]
tokenizer = MWETokenizer(multi_word)

down
the
and
own
your
off


Unnamed: 0,Actions to advance sustainablity,lemmatized_keywords
0,eating_local,eating_local
1,biking,biking
2,composting,composting
3,recycling,recycling
4,saving,saving
5,hiking,hiking
6,fishing,fishing
7,camping,camping
8,growing,growing
9,preserving,preserving


In [8]:
def lemmatize_text(col):
    """Turn one post into a list of cleaned, lemmatized, lower-cased tokens.

    Steps, in order:
      1. replace the typographic apostrophe with a plain one and split on whitespace;
      2. merge known multi-word keywords with the global ``tokenizer``;
      3. drop tokens whose lower-cased form is in ``stopWords`` or that are a single character;
      4. remove every character found in ``exclude``;
      5. lemmatize each remaining word, keep lemmas longer than one character, lower-case them.

    Parameters
    ----------
    col : str
        Text of the post.

    Returns
    -------
    list of str
    """
    normalized = col.replace('’', '\'')
    merged_tokens = tokenizer.tokenize(normalized.split())

    # remove stop words and one-character tokens
    kept = []
    for token in merged_tokens:
        if len(token) > 1 and token.lower() not in stopWords:
            kept.append(token)
    stop_free = ' '.join(kept)

    # remove punctuation character by character
    punc_free = ''.join(filter(lambda ch: ch not in exclude, stop_free))

    # lemmatize; a lemma of length <= 1 is discarded
    lowered = []
    for word in punc_free.split():
        base = lemma.lemmatize(word)
        if len(base) > 1:
            lowered.append(base.lower())
    return ' '.join(lowered).split()

# collect the tokens of a post that are action keywords
def find_matched_keywords(col):
    """Return the tokens of ``col`` that occur in the global ``keywords_list``.

    Order and duplicates are preserved: a keyword that appears twice in the
    post appears twice in the result.

    Parameters
    ----------
    col : iterable of str
        Tokens of one post.

    Returns
    -------
    list of str
        Matching tokens; empty when nothing matches.
    """
    return [token for token in col if token in keywords_list]

## Read and merge the input CSV files

In [9]:
pd.options.display.max_rows = 10
# Read every CSV file matched by `data_folder`, keep only the target columns,
# drop rows without a caption, and merge everything into one dataframe.
filePaths = glob.glob(data_folder)
frames = []
for filename in filePaths:
    print(filename)
    df_i = pd.read_csv(filename, encoding='utf-8')
    df_i = df_i[OUTPUT_COLS]
    df_x = df_i.dropna(subset=['caption_original'])
    frames.append(df_x)

# Concatenate once at the end: DataFrame.append re-copied the accumulated data
# on every iteration and no longer exists in pandas >= 2.0.
if frames:
    data_df = pd.concat(frames, ignore_index=True)
else:
    # No input file matched the pattern: keep an empty frame with the expected columns.
    data_df = pd.DataFrame(columns=OUTPUT_COLS)

display(data_df)

./Accounts/ParksCanada_tweets.csv
./Accounts/CanadianAgriculture_facebook_statuses.csv
./Accounts/CCG_GCC_tweets.csv
./Accounts/FisheriesOceansCanada_facebook_statuses.csv
./Accounts/CanadianCoastGuard_facebook_statuses.csv
./Accounts/YourMoneyMattersCanada_facebook_statuses.csv
./Accounts/EnvironmentandClimateChange_facebook_statuses.csv
./Accounts/parks.canada_posts.csv
./Accounts/TransportandInfrastructureinCanada_facebook_statuses.csv
./Accounts/environmentca_tweets.csv
./Accounts/NRCan_tweets.csv
./Accounts/ENERGYSTAR_CAN_tweets.csv
./Accounts/DFO_Gulf_tweets.csv
./Accounts/ec_minister_tweets.csv
./Accounts/DFO_Central_tweets.csv
./Accounts/DFO_Pacific_tweets.csv
./Accounts/DFO_NL_tweets.csv
./Accounts/DFO_CCG_Quebec_tweets.csv
./Accounts/DFO_Science_tweets.csv
./Accounts/Transport_gc_tweets.csv
./Accounts/GiantMine_tweets.csv
./Accounts/TSBCanada_tweets.csv
./Accounts/CTA_gc_tweets.csv
./Accounts/AskISED_tweets.csv
./Accounts/DFO_MPO_tweets.csv
./Accounts/GovCanNorth_facebook_sta

Unnamed: 0,caption_original
0,"RT @CanadianPM: 117,000 poppies, one for each ..."
1,A big thank you 🎈 for an incredible #Canada150...
2,Qausuittuq National Park is officially open! L...
3,It’s the first day of #PolarBearWeek! ❄️ From ...
4,It’s #RemembranceDay 🇨🇦 Honour those who have ...
...,...
44998,Be part of the conversation of a generation! M...
44999,Funding from Budget will increase charging/ref...
45000,The Government of Canada approved the Enbridge...
45001,The Government of Canada approved the Trans Mo...


In [10]:
pd.options.display.max_rows = 50
# Lemmatize every caption, then record which action keywords each post contains.
try:
    data_df['lemmatized_text'] = data_df['caption_original'].astype(str).apply(lemmatize_text)
except Exception:
    # Report the failure but do not hide it: the statements below need the
    # 'lemmatized_text' column, so continuing would fail with an unrelated
    # KeyError or silently reuse a stale column from an earlier run.
    print('cannot process file...')
    raise
data_df['Action_matched_keywords'] = data_df['lemmatized_text'].apply(find_matched_keywords)
display(data_df[['lemmatized_text', 'Action_matched_keywords']])

Unnamed: 0,lemmatized_text,Action_matched_keywords
0,"[rt, canadianpm, 117000, poppy, one, canada's,...",[]
1,"[big, thank, incredible, canada150, year, beha...",[]
2,"[qausuittuq, national, park, officially, open,...",[]
3,"[the, first, day, polarbearweek, ❄️, cool, pho...",[]
4,"[remembranceday, 🇨🇦, honour, served, country, ...",[]
5,"[rt, watertonlakesnp, thank, the, crew, worked...",[]
6,"[exciting, news, starting, 2018, admission, pa...",[]
7,"[mentalhealthawarenessweek, want, hear, you, p...",[]
8,"[worldkindnessday, important, kind, the, show,...","[protecting, ecosystem]"
9,"[baby, bat, called, pup, and, bat, one, year, ...",[]


In [11]:
pd.options.display.max_rows = 100
# Keep only the posts in which at least one action keyword was matched:
# an empty match list renders as the string '[]'.
has_no_match = data_df['Action_matched_keywords'].astype(str) == '[]'
output_df = data_df[~has_no_match]
output_df

Unnamed: 0,caption_original,lemmatized_text,Action_matched_keywords
8,It’s #WorldKindnessDay! It’s so important to b...,"[worldkindnessday, important, kind, the, show,...","[protecting, ecosystem]"
18,"NEW: starting January 1st, admission to our pl...","[new, starting, january, 1st, admission, place...",[keeping]
38,A day of #hiking will take you to #TwinFalls. ...,"[day, hiking, take, twinfalls, share, photo, t...",[hiking]
51,Happy #WorldWildlifeConservationDay! 🦋🐻🐢 Parks...,"[happy, worldwildlifeconservationday, 🦋🐻🐢, par...",[working]
55,#DYK #bats native to Canada are insect-eaters ...,"[dyk, bat, native, canada, insect-eaters, and,...",[hunting]
63,We love keeping our national parks free of was...,"[love, keeping, national, park, free, waste, c...","[keeping, consider, recycling, composting, cam..."
76,Happy #LighthouseDay! #DYK the oldest survivin...,"[happy, lighthouseday, dyk, the, oldest, survi...",[surviving]
87,RT @WatertonLakesNP: The Kenow Fire is demonst...,"[rt, watertonlakesnp, the, kenow, fire, demons...",[demonstrating]
105,"Summer is coming to an end, and what an amazin...","[summer, coming, end, and, amazing, summer, wa...",[planning]
106,#DYK the #borealforest is one of the largest #...,"[dyk, the, borealforest, one, the, largest, ec...",[ecosystem]


## Train the word embedding model (only one line of code is needed)

In [12]:
# Train the embedding on the lemmatized token lists of all posts.
model = Word2Vec(
    sentences=data_df['lemmatized_text'],
    size=600,       # dimensionality of the word vectors
    window=50,      # context window, in tokens
    min_count=20,   # ignore words that occur fewer than 20 times
)
# Persist the trained model so it can be reloaded instead of retrained.
model.save('word2vec_model.bin')
## uncomment the following line of code to load an existing model instead of training a new one.
#model = Word2Vec.load('word2vec_model.bin')
#list(model.wv.vocab)

## Find new action words using the existing actions

In [13]:
# For every known action keyword, look up its `top_n` nearest neighbours in the
# embedding space and keep those that are not already known keywords.
output_dict = {}
top_n = 30
counter = 1   # 1-based number of the next keyword reported as missing
for keyword in keywords_list:
    try:
        tuple_list = model.wv.most_similar(positive=[keyword], topn=top_n)
    except KeyError:
        # The keyword is not in the model vocabulary.
        print(str(counter) + ': keyword \"' + keyword + '\" is not found...')
        counter += 1
        continue
    # most_similar returns (word, similarity) pairs. The word itself must be
    # compared with the keyword set: comparing the whole pair never matches,
    # so known keywords were never filtered out.
    new_actions = {word for word, _similarity in tuple_list if word not in keywords_list}
    output_dict[keyword] = new_actions
pprint(output_dict)
print(str(counter-1) + ' given keywords not found in the embedding model..')

1: keyword "designing" is not found...
2: keyword "overhauling" is not found...
3: keyword "green_purchasing" is not found...
4: keyword "ceramic_baking_pan" is not found...
5: keyword "taking_home_food" is not found...
6: keyword "local_market" is not found...
7: keyword "cooling" is not found...
8: keyword "solving" is not found...
9: keyword "meal_plan" is not found...
10: keyword "food_product" is not found...
11: keyword "eco-friendly_agent" is not found...
12: keyword "wood_product" is not found...
13: keyword "speeding" is not found...
14: keyword "grocery_bag" is not found...
15: keyword "farmer's_market" is not found...
16: keyword "decision-making" is not found...
17: keyword "le_methane_release" is not found...
18: keyword "preserving" is not found...
19: keyword "beautifying" is not found...
20: keyword "bpa-free" is not found...
21: keyword "shopping_online" is not found...
22: keyword "eco-savvy" is not found...
23: keyword "expired" is not found...
24: keyword "recycling

           'backtoschool',
           'bchydro',
           'choose',
           'clothes',
           'clothing',
           'dinner',
           'dish',
           'dress',
           'energysaving',
           'game',
           'giving',
           'greenhalloween',
           'helpful',
           'indoors',
           'loved',
           'plenty',
           'recipe',
           'save5',
           'setting',
           'shop',
           'so',
           'spending',
           'staying',
           'stress',
           'surprise',
           'thermostat',
           'trick',
           'turn',
           'unplug'},
 'building': {'address',
              'agreement',
              'cdn',
              'china',
              'cleantech',
              'committed',
              'cooperation',
              'economic',
              'fund',
              'growth',
              'industry',
              'infrastructure',
              'initiative',
              'investing',
      

                'missioninnovation',
                'modernizing',
                'nuclear',
                'poweringpastcoal',
                'pricing',
                'promote',
                'resilience',
                'transition',
                'uk',
                "🇨🇦's"},
 'paying': {'already',
            'bought',
            'calculator',
            'carseat',
            'cash',
            'charged',
            'charity',
            'choose',
            'choosing',
            'clothing',
            'convenient',
            'helpful',
            'insolvency',
            'lit',
            'mail',
            'mortgage',
            'payday',
            'print',
            'purchase',
            'qualify',
            'rate',
            'someone',
            'stress',
            'that',
            'traveltuesdays',
            'trustee',
            'unwanted',
            'warranty',
            'wondering',
            'written'},
 'planning': {'