In [1]:
import glob
import os
import pandas as pd
import nltk
#nltk.download()   # comment after first download
from nltk.tokenize import MWETokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import gensim
from gensim import corpora
from gensim.models import Word2Vec
import string
from numbers import Number
from pprint import pprint
import logging
import operator
from pprint import pprint
pd.options.display.max_rows = 30

In [2]:
keywords_chosen = 'Que5_Que6 KeywordLists.csv'

data_folder = './Accounts/*.csv'
OUTPUT_COLS = ['id','date_published','link','caption_original','caption_cleaned','hashtags','num_comments',
               'num_shares','num_likes','Reactions_SUM','category','matched_keywords','language',
               'average_sentiment_score','sentiment', 'Action_matched_keywords']


In [3]:
# create output directory
outputDir = os.path.dirname(data_folder) + '/q5q6_output/'
if not os.path.exists(outputDir):
    os.makedirs(outputDir)

In [4]:
# set of punctuations to remove from text
exclude = set(string.punctuation)

In [5]:
stopWords = set(stopwords.words('english'))
stopWords.add('theyre')
stopWords.add("we're")
stopWords.add("we've") 

lemma = WordNetLemmatizer()    # NLTK English lemmatizer

# detect_lang function can be use to check the percentage of non English posts
# note that missing value NaN can be detected as many different languages such english, spanish or italian
def detect_lang(text):
    try:
        lang = detect(text)
    except:
        return 'error'
    return lang

# lemmatize_keywords also clears 'nan' from input keyword list file
# lemmatization is conducted based on context, some words may not get lemmatized, 
# e.g. "local eating" does not get lemmatized to "local eat"
def lemmatize_keywords(col):
    if col.lower() == 'nan':
        return ''
    # if a stopword appreas in given keyword list, this stopword will be removed from stopword list
    for keyword in col.split():
        if keyword in stopWords:
            print(keyword)
            stopWords.remove(keyword)
    return '_'.join(lemma.lemmatize(word).lower() for word in col.replace('’', '\'').replace('.', '').split()) #lemma.lemmatize(word)


In [6]:
#print(stopWords)

In [7]:
# load keywords list
pd.options.display.max_rows = 100
keywords_df = pd.read_csv(keywords_chosen, encoding='utf-8')   # "ISO-8859-1"

keywords_df['lemmatized_keywords'] = keywords_df['Actions to advance sustainablity'].astype(str).apply(lemmatize_keywords)
keywords_list = set(keywords_df['lemmatized_keywords'].tolist())
display(keywords_df)

# if there are punctuations in the keywords list, these punctuation will be kept regardless of puncturation removal step
for word in keywords_list:
    for char in word:
        if char in exclude:
            exclude.remove(char)
            
# Add all words in the given keyword list to pre-defined token dictionary
multi_word = [w.split('_') for w in keywords_list ] 
tokenizer = MWETokenizer(multi_word)

down
the
own
off


Unnamed: 0,Actions to advance sustainablity,lemmatized_keywords
0,eating_local,eating_local
1,biking,biking
2,composting,composting
3,recycling,recycling
4,saving,saving
5,hiking,hiking
6,fishing,fishing
7,camping,camping
8,growing,growing
9,preserving,preserving


In [8]:
def lemmatize_text(col): 
    text = col.replace('’', '\'')
    tokens = tokenizer.tokenize(text.split())   
    # remove stop words
    stop_free = ' '.join(w for w in tokens if w.lower() not in stopWords and len(w) > 1)
    # remove punctuation
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    # lemmatize
    lemmas = ' '.join(lemma.lemmatize(word).lower() for word in punc_free.split() if len(lemma.lemmatize(word)) > 1)
    return lemmas.split()

# assign a category based the max number of keywords found in each category
def find_matched_keywords(col):
    keywords_found = []
    for word in col:
        if word in keywords_list:
            keywords_found.append(word) 
    return keywords_found

## Read and merge input csv files

In [9]:
pd.options.display.max_rows = 10
# read csv files and save targt columns to dataframe
filePaths = glob.glob(data_folder)  
data_df = pd.DataFrame(columns=OUTPUT_COLS)
for filename in filePaths:
    print(filename)    
    df_i = pd.read_csv(filename, encoding = 'utf-8')
    df_x = df_i.dropna(subset=['caption_original']) 
    data_df = data_df.append(df_x, ignore_index=True)

display(data_df)

./Accounts/ParksCanada_tweets.csv
./Accounts/CanadianAgriculture_facebook_statuses.csv
./Accounts/CCG_GCC_tweets.csv
./Accounts/FisheriesOceansCanada_facebook_statuses.csv
./Accounts/CanadianCoastGuard_facebook_statuses.csv
./Accounts/YourMoneyMattersCanada_facebook_statuses.csv
./Accounts/EnvironmentandClimateChange_facebook_statuses.csv
./Accounts/parks.canada_posts.csv
./Accounts/TransportandInfrastructureinCanada_facebook_statuses.csv
./Accounts/environmentca_tweets.csv
./Accounts/NRCan_tweets.csv
./Accounts/ENERGYSTAR_CAN_tweets.csv
./Accounts/DFO_Gulf_tweets.csv
./Accounts/ec_minister_tweets.csv
./Accounts/DFO_Central_tweets.csv
./Accounts/DFO_Pacific_tweets.csv
./Accounts/DFO_NL_tweets.csv
./Accounts/DFO_CCG_Quebec_tweets.csv
./Accounts/DFO_Science_tweets.csv
./Accounts/Transport_gc_tweets.csv
./Accounts/GiantMine_tweets.csv
./Accounts/TSBCanada_tweets.csv
./Accounts/CTA_gc_tweets.csv
./Accounts/AskISED_tweets.csv
./Accounts/DFO_MPO_tweets.csv
./Accounts/GovCanNorth_facebook_sta

Unnamed: 0.1,Action_matched_keywords,Reactions_SUM,Unnamed: 0,account_name,average_sentiment_score,caption_cleaned,caption_original,category,date_published,hashtags,...,num_hahas,num_likes,num_loves,num_reactions,num_sads,num_shares,num_special,num_wows,sentiment,words_matched_list
0,,2372,0.0,ParksCanada_tweeter,-0.155556,RT CanadianPM 117 000 poppies one Fallen casca...,"RT @CanadianPM: 117,000 poppies, one for each ...",unknown,2017-11-10 22:37:12,['#CanadaRemembers'],...,,0,,,,2372,,,Neutral,
1,,1970,1.0,ParksCanada_tweeter,0.300000,big thank incredible Canada150 year behalf ent...,A big thank you 🎈 for an incredible #Canada150...,Environmental,2017-12-28 20:05:10,"['#Canada150', '#ParksCanada']",...,,1471,,,,499,,,Neutral,"['park', 'or']"
2,,595,2.0,ParksCanada_tweeter,0.220000,Qausuittuq National Park officially open Learn...,Qausuittuq National Park is officially open! L...,Environmental,2017-08-10 19:20:34,"['#Arctic', '#Nunavut']",...,,433,,,,162,,,Neutral,"['lands', 'park']"
3,,510,3.0,ParksCanada_tweeter,0.214583,first day PolarBearWeek cool photos fun facts ...,It’s the first day of #PolarBearWeek! ❄️ From ...,unknown,2017-11-06 00:25:05,['#PolarBearWeek'],...,,373,,,,137,,,Neutral,
4,,439,4.0,ParksCanada_tweeter,0.000000,RemembranceDay Honour served country share pho...,It’s #RemembranceDay 🇨🇦 Honour those who have ...,Environmental,2017-11-11 13:15:17,"['#RemembranceDay', '#ParksChallenge']",...,,333,,,,106,,,Neutral,['park']
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44998,,3,232.0,naturalresourcescanada_instagram,0.000000,part conversation generation Min Sohi invites ...,Be part of the conversation of a generation! M...,Economical,2017-10-04T08:22:12,"[""genenergy""]",...,,3,,,,,,,neutral,['energy']
44999,,3,233.0,naturalresourcescanada_instagram,0.000000,Funding Budget 2016 increase charging refuelli...,Funding from Budget will increase charging/ref...,Economical,2016-12-08T08:45:08,"[""electricvehicles"",""naturalgas"",""electricity""...",...,,3,,,,,,,neutral,"['infrastructure', 'funding']"
45000,,2,234.0,naturalresourcescanada_instagram,0.000000,Government Canada approved Enbridge Line Repla...,The Government of Canada approved the Enbridge...,Environmental,2016-12-08T08:20:07,"[""pipeline"",""canada"",""economy"",""replace"",""natu...",...,,2,,,,,,,neutral,"['natural', 'resources', 'government']"
45001,,1,235.0,naturalresourcescanada_instagram,0.000000,Government Canada approved Trans Mountain Expa...,The Government of Canada approved the Trans Mo...,Environmental,2016-12-08T08:17:30,"[""pipeline"",""canada"",""economy"",""expansion"",""tm...",...,,1,,,,,,,neutral,"['natural', 'resources', 'government']"


In [10]:
pd.options.display.max_rows = 50
try:
    data_df['lemmatized_text'] = data_df['caption_original'].astype(str).apply(lemmatize_text)
except:
    print('cannot process file...')
data_df['Action_matched_keywords'] = data_df['lemmatized_text'].apply(find_matched_keywords)
display(data_df)

Unnamed: 0.1,Action_matched_keywords,Reactions_SUM,Unnamed: 0,account_name,average_sentiment_score,caption_cleaned,caption_original,category,date_published,hashtags,...,num_likes,num_loves,num_reactions,num_sads,num_shares,num_special,num_wows,sentiment,words_matched_list,lemmatized_text
0,[],2372,0.0,ParksCanada_tweeter,-0.155556,RT CanadianPM 117 000 poppies one Fallen casca...,"RT @CanadianPM: 117,000 poppies, one for each ...",unknown,2017-11-10 22:37:12,['#CanadaRemembers'],...,0,,,,2372,,,Neutral,,"[rt, canadianpm, 117000, poppy, one, canada's,..."
1,[],1970,1.0,ParksCanada_tweeter,0.300000,big thank incredible Canada150 year behalf ent...,A big thank you 🎈 for an incredible #Canada150...,Environmental,2017-12-28 20:05:10,"['#Canada150', '#ParksCanada']",...,1471,,,,499,,,Neutral,"['park', 'or']","[big, thank, incredible, canada150, year, beha..."
2,[learn],595,2.0,ParksCanada_tweeter,0.220000,Qausuittuq National Park officially open Learn...,Qausuittuq National Park is officially open! L...,Environmental,2017-08-10 19:20:34,"['#Arctic', '#Nunavut']",...,433,,,,162,,,Neutral,"['lands', 'park']","[qausuittuq, national, park, officially, open,..."
3,"[cool, bear]",510,3.0,ParksCanada_tweeter,0.214583,first day PolarBearWeek cool photos fun facts ...,It’s the first day of #PolarBearWeek! ❄️ From ...,unknown,2017-11-06 00:25:05,['#PolarBearWeek'],...,373,,,,137,,,Neutral,,"[the, first, day, polarbearweek, ❄️, cool, pho..."
4,[],439,4.0,ParksCanada_tweeter,0.000000,RemembranceDay Honour served country share pho...,It’s #RemembranceDay 🇨🇦 Honour those who have ...,Environmental,2017-11-11 13:15:17,"['#RemembranceDay', '#ParksChallenge']",...,333,,,,106,,,Neutral,['park'],"[remembranceday, 🇨🇦, honour, served, country, ..."
5,[protect],434,5.0,ParksCanada_tweeter,0.000000,RT WatertonLakesNP Thank crews worked protect ...,RT @WatertonLakesNP: Thank you to all the crew...,Environmental,2017-09-16 01:03:37,['#Waterton'],...,0,,,,434,,,Neutral,"['water', 'park', 'lakes', 'or']","[rt, watertonlakesnp, thank, the, crew, worked..."
6,[],419,6.0,ParksCanada_tweeter,0.258333,Exciting news Starting 2018 admission ParksCan...,"Exciting news! Starting in 2018, admission to ...",Environmental,2017-11-30 19:15:27,"['#ParksCanada', '#FREE']",...,245,,,,174,,,Neutral,"['park', 'or']","[exciting, news, starting, 2018, admission, pa..."
7,[help],397,7.0,ParksCanada_tweeter,0.300000,MentalHealthAwarenessWeek want hear places hel...,"It’s #MentalHealthAwarenessWeek, so we want to...",Social,2017-10-01 16:05:15,"['#MentalHealthAwarenessWeek', '#MIAW17']",...,281,,,,116,,,Neutral,['health'],"[mentalhealthawarenessweek, want, hear, you, p..."
8,"[care, protecting, ecosystem]",337,8.0,ParksCanada_tweeter,0.500000,WorldKindnessDay important kind show care prot...,It’s #WorldKindnessDay! It’s so important to b...,Environmental,2017-11-13 13:10:15,['#WorldKindnessDay'],...,230,,,,107,,,Positive,"['ecosystem', 'biodiversity', 'or']","[worldkindnessday, important, kind, the, show,..."
9,[],330,9.0,ParksCanada_tweeter,0.280000,Baby bats called pups bats one year nursed big...,"Baby #bats are called ""pups"" and bats have onl...",unknown,2017-10-28 18:00:55,"['#bats', '#BatWeek']",...,244,,,,86,,,Neutral,,"[baby, bat, called, pup, bat, one, year, nurse..."


In [11]:
pd.options.display.max_rows = 100
output_df = data_df[data_df['Action_matched_keywords'].astype(str) != '[]']
output_df = output_df[OUTPUT_COLS]
output_df.to_csv(outputDir + 'q5q6_merged_ouput_EN.csv', index=None, encoding='utf-8')
output_df

Unnamed: 0,id,date_published,link,caption_original,caption_cleaned,hashtags,num_comments,num_shares,num_likes,Reactions_SUM,category,matched_keywords,language,average_sentiment_score,sentiment,Action_matched_keywords
2,895726583729856513,2017-08-10 19:20:34,"['https://t.co/3kCgJN4RYV', 'https://t.co/8d6M...",Qausuittuq National Park is officially open! L...,Qausuittuq National Park officially open Learn...,"['#Arctic', '#Nunavut']",,162,433,595,Environmental,,en,0.220000,Neutral,[learn]
3,927330962379366402,2017-11-06 00:25:05,['https://t.co/3SraumhUxm'],It’s the first day of #PolarBearWeek! ❄️ From ...,first day PolarBearWeek cool photos fun facts ...,['#PolarBearWeek'],,137,373,510,unknown,,en,0.214583,Neutral,"[cool, bear]"
5,908858879999713280,2017-09-16 01:03:37,['https://t.co/TzqLZx9U0L'],RT @WatertonLakesNP: Thank you to all the crew...,RT WatertonLakesNP Thank crews worked protect ...,['#Waterton'],,434,0,434,Environmental,,en,0.000000,Neutral,[protect]
7,914521600485191680,2017-10-01 16:05:15,['https://t.co/RtNVJ0U5ZO'],"It’s #MentalHealthAwarenessWeek, so we want to...",MentalHealthAwarenessWeek want hear places hel...,"['#MentalHealthAwarenessWeek', '#MIAW17']",,116,281,397,Social,,en,0.300000,Neutral,[help]
8,930060241269673984,2017-11-13 13:10:15,['https://t.co/A0Rz84ahSv'],It’s #WorldKindnessDay! It’s so important to b...,WorldKindnessDay important kind show care prot...,['#WorldKindnessDay'],,107,230,337,Environmental,,en,0.500000,Positive,"[care, protecting, ecosystem]"
13,915545833017413632,2017-10-04 11:55:11,"['https://t.co/nlPZZ6s2fj', 'https://t.co/pczk...",Today is #WorldAnimalDay! See what we’re doing...,Today WorldAnimalDay See protect speciesatrisk...,"['#WorldAnimalDay', '#speciesatrisk']",,90,213,303,Environmental,,en,0.000000,Neutral,[protect]
17,899288715738132481,2017-08-20 15:15:12,"['https://t.co/QMM6AKVd0w', 'https://t.co/71Kw...",Tomorrow you'll be able to see partial solar e...,Tomorrow able see partial solar eclipse many l...,[],,98,164,262,Environmental,,en,0.341667,Positive,[protect]
22,898897405738930176,2017-08-19 13:20:16,"['https://t.co/m16HnF1pjH', 'https://t.co/SH1r...",Happy #WorldHoneyBeeDay! 🐝 Help protect #wildl...,Happy WorldHoneyBeeDay Help protect wildlife u...,"['#WorldHoneyBeeDay', '#wildlife', '#iNaturali...",,79,163,242,Environmental,,en,1.000000,Positive,"[help, protect]"
26,915992557456207872,2017-10-05 17:30:18,['https://t.co/L4MXjRMi8S'],Get out &amp; experience the colours of fall! ...,Get amp experience colours fall Grab scarf hea...,['#ParksChallenge'],,57,176,233,Environmental,,en,0.250000,Neutral,[experience]
30,944005857536282624,2017-12-22 00:45:09,"['https://t.co/GyXd6fMmx', 'https://t.co/ZdgQY...",It’s officially the shortest day of the year… ...,officially shortest day WinterSolstice Take ad...,['#WinterSolstice'],,55,153,208,Environmental,,en,-0.150000,Neutral,[preserve]


## Only one line of code is needed to train word embedding model

In [12]:
model = Word2Vec(data_df['lemmatized_text'], size=600, window=50, min_count=20)
# save model
model.save('word2vec_model.bin')
## uncomment the following line of code to load an existing model instead of training a new one.
#model = Word2Vec.load('word2vec_model.bin')
#list(model.wv.vocab)

## find new action words using existing actions

In [13]:
output_dict = {}
top_n = 30
counter = 1
for keyword in keywords_list:
    try:
        tuple_list = model.wv.most_similar(positive=[keyword], topn=top_n)
    except KeyError:
        print(str(counter) + ': keyword \"' + keyword + '\" is not found...')
        counter += 1
        continue
    new_actions = set(ele[0] for ele in tuple_list if ele not in keywords_list)
    output_dict[keyword] = new_actions
pprint(output_dict)
print(str(counter-1) + ' given keywords not found in the embedding model..')

1: keyword "online_shopping" is not found...
2: keyword "solve" is not found...
3: keyword "cure" is not found...
4: keyword "freezer" is not found...
5: keyword "recycle" is not found...
6: keyword "wasting_food" is not found...
7: keyword "cut_down_gas_emission" is not found...
8: keyword "pursuing" is not found...
9: keyword "drink" is not found...
10: keyword "decision-making" is not found...
11: keyword "medication" is not found...
12: keyword "wood_product" is not found...
13: keyword "le_packaging" is not found...
14: keyword "cooling" is not found...
15: keyword "expired" is not found...
16: keyword "locally_grown_food" is not found...
17: keyword "tempting" is not found...
18: keyword "unpacking_grocery" is not found...
19: keyword "surviving" is not found...
20: keyword "expiration_date" is not found...
21: keyword "overhauling" is not found...
22: keyword "light_off" is not found...
23: keyword "driving_le" is not found...
24: keyword "strenghtening" is not found...
25: keyw

         'appliance',
         'backtoschool',
         'blackfriday',
         'clothes',
         'costume',
         'download',
         'easy',
         'energyefficient',
         'energysaving',
         'game',
         'gift',
         'giving',
         'got',
         'kid',
         "kids'",
         'let',
         'loved',
         'movie',
         'own',
         'pcapp',
         'recipe',
         'shopping',
         'something',
         'soon',
         'spending',
         "there's",
         'thermostat',
         'turn',
         'up'},
 'address': {'agreement',
             'budget2017',
             'canadachina',
             'china',
             'cleantech',
             'collaboration',
             'commitment',
             'committed',
             'cooperation',
             'developing',
             'discussed',
             'economic',
             'economy',
             'gc',
             'greater',
             'growth',
             'implementin

                   'discussed',
                   'electricity',
                   'greenhouse',
                   'keynote',
                   'leading',
                   'low_carbon',
                   'macaulay',
                   'ministerial',
                   'nuclear',
                   'paris',
                   'pricing',
                   'ps',
                   'renewable',
                   'rudd',
                   'scott',
                   'specializing',
                   'treatment',
                   'uk',
                   '“we',
                   '🇨🇦'},
 'collaborate': {'aandcanada',
                 'achieve',
                 'addressing',
                 'brings',
                 'campaign',
                 'combat',
                 'continuing',
                 'convention',
                 'diversity',
                 'fbf',
                 "goc's",
                 'highlighting',
                 'integrated',
                 'in

                     'longer',
                     'print',
                     'rebate',
                     'save5',
                     'saveenergy',
                     'shopping',
                     'simple',
                     'store',
                     'thermostat',
                     'turn',
                     'tv',
                     'washer',
                     'wintertips'},
 'energysaving': {'bad',
                  'beat',
                  'blackfriday',
                  'christmas',
                  'clothes',
                  'clothing',
                  'coffee',
                  'comfort',
                  'costume',
                  'dress',
                  'extra',
                  'fan',
                  'feeling',
                  'game',
                  'gift',
                  'giving',
                  'halloween',
                  'hit',
                  'late',
                  'not',
                  'nothing',
       

                'contamination',
                'dangerous',
                'expect',
                'fisher',
                'immediately',
                'includes',
                'mariners',
                'operate',
                'prawn',
                'rec',
                'restriction',
                'sanitary',
                'shellfish',
                'strike',
                'subarea',
                'subareas',
                'suspicious',
                'tidepsp',
                'watertonlakesnp'},
 'heat': {'able',
          'avoid',
          'bag',
          'buy',
          'cancellation',
          'case',
          'cost',
          'cover',
          'device',
          'driving',
          'fast',
          'flysmart',
          'heater',
          'heating',
          'higher',
          'however',
          'item',
          'limit',
          'lower',
          'monthly',
          'phone',
          'pressure',
          'properly',
       

            'soil',
            'understand',
            'understanding',
            'wood',
            'yield'},
 'mining': {'agreement',
            'canadachina',
            'cdnpoli',
            'cleanenergy',
            'cleangrowth',
            'committed',
            'creating',
            'developing',
            'economic',
            'economy',
            'europe',
            'framework',
            'growth',
            'initiative',
            'innovative',
            'investing',
            'leader',
            'leadership',
            'mexico',
            'mineral',
            'mou',
            'nuclear',
            'organization',
            'pan-canadian',
            'promote',
            'relationship',
            'resilience',
            'sector',
            'strong',
            'trading'},
 'modernizing': {'achieve',
                 'achieving',
                 'acoacanada',
                 'addressed',
                 'boost',
     

             'coffee',
             'comfort',
             'cooler',
             'cup',
             'extra',
             'hit',
             'hitting',
             'inside',
             'late',
             'meal',
             'nodronezone',
             'not',
             'notabrightidea',
             'overnight',
             'pack',
             'piece',
             'pointing',
             'pool',
             'remind',
             'ride',
             'rise',
             'shine',
             'sleep',
             'visible',
             'warm'},
 'save': {'avoid',
          'baggage',
          'bulb',
          'buy',
          'buying',
          'card',
          'certified',
          'cost',
          'deal',
          'decide',
          'energyefficiency',
          'energystarcanada',
          'example',
          'expense',
          'fee',
          'heating',
          'item',
          'mean',
          'money',
          'own',
          'pay',
         