In [1]:
import glob
import os
import sys
import pandas as pd
import nltk
#nltk.download()   # comment after first download
from nltk.tokenize import MWETokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import gensim
from gensim import corpora
import string
from numbers import Number
from pprint import pprint
import logging
import operator
from FrenchLefffLemmatizer.FrenchLefffLemmatizer import FrenchLefffLemmatizer

# Cap DataFrame output at 30 rows by default; later cells raise this limit to 100.
pd.options.display.max_rows = 30

In [2]:
# CSV file holding the keyword list; a later cell reads it and treats each column as one category
keywords_chosen = 'Keywords_ECCC_FR.csv'

# glob pattern selecting the input CSV files to process
data_folder = './Accounts/*.csv'


In [3]:
# create output directory (a 'q1_output' sub-folder next to the input files)
outputDir = os.path.dirname(data_folder) + '/q1_output/'
# exist_ok=True removes the check-then-create race and makes re-running this cell a no-op
os.makedirs(outputDir, exist_ok=True)

In [4]:
# set of punctuation characters to remove from text
# (the keyword-loading cell later removes from this set every character that occurs in a keyword)
exclude = set(string.punctuation)

In [5]:
# French stop words from the NLTK stopwords corpus, held in a set for membership tests
# (needs the NLTK data to be installed; see the commented-out nltk.download() in the import cell)
stopWords = set(stopwords.words('french'))

lemma = FrenchLefffLemmatizer()    # French lemmatizer

def lemmatize_keywords(col):
    """Normalise one cell of the keyword list into a single lemmatized token.

    A cell whose text is 'nan' (in any letter case) is a blank entry in the
    keyword file and yields ''. Otherwise every whitespace-separated word is
    run through the notebook-level ``lemma`` lemmatizer, lower-cased, and the
    words are glued together with '_' so that a multi-word keyword becomes one
    token.

    Per the original author's note, lemmatization depends on context, so some
    words may come back unchanged.
    """
    # blank cells of the keyword file show up as the text 'nan'
    if str(col).lower() == 'nan':
        return ''
    lemmatized_parts = []
    for part in col.split():
        lemmatized_parts.append(lemma.lemmatize(part).lower())
    return '_'.join(lemmatized_parts)


In [6]:
# load keywords list: one column per category, one keyword (possibly multi-word) per cell
pd.options.display.max_rows = 100
keywords_df = pd.read_csv(keywords_chosen, encoding='latin-1')   # "ISO-8859-1"
KEYWORDS_COLS = keywords_df.columns
lemma_keywords_df = pd.DataFrame(columns=KEYWORDS_COLS)
category_dict = {}      # category name -> set of lemmatized keywords
keywords_list = set()   # union of the keywords of every category
for col in KEYWORDS_COLS:
    lemma_keywords_df[col] = keywords_df[col].astype(str).apply(lemmatize_keywords)
    # blank cells are lemmatized to '' and are not real keywords; the set
    # difference drops them without needing a try/except around remove()
    category_dict[col] = set(lemma_keywords_df[col].tolist()) - {''}
    keywords_list |= category_dict[col]
display(lemma_keywords_df)

# Any punctuation character that occurs in a keyword (including the '_' that
# joins multi-word keywords) is kept in the text: take it out of the set of
# characters stripped by the punctuation-removal step.
exclude.difference_update(char for word in keywords_list for char in word)

# Register every keyword with the multi-word-expression tokenizer so that the
# words of a multi-word keyword are merged into one token
multi_word = [w.split('_') for w in keywords_list]
tokenizer = MWETokenizer(multi_word)

Unnamed: 0,Social,Economical,Environmental
0,famille,énergie,agriculture
1,ami,secteur,les_émission_de_gaz_à_effet_de_serre
2,copain,importation,gaz_à_effet_de_serre
3,copine,import,conservation
4,épouse,import,environnement
5,époux,industrie,durable
6,enfant,commerce_électronique,soutenable
7,changement,tendance_des_marché,changement_climatique
8,communauté,croissance,recyclage
9,culture,mondiale,gaz


In [7]:
# Sanity check: print every keyword that is shorter than three characters
short_keywords = [kw for kw in keywords_list if len(kw) < 3]
for kw in short_keywords:
    print(kw)

or


In [8]:
def lemmatize_text(row):
    """Turn the original caption of one row into a list of lower-case lemmas.

    The raw ``caption_original`` column is used on purpose: the script that
    originally cleaned the data did not handle French text correctly, so the
    ``caption_cleaned`` and ``hashtags`` columns are corrupted and should not
    be used. All cleaning (tokenizing, removal of unnecessary punctuation and
    lemmatization) is done here instead.

    Relies on the notebook-level ``tokenizer``, ``stopWords``, ``exclude`` and
    ``lemma`` objects.

    Args:
        row: mapping-like row (e.g. a DataFrame row) with a
            ``caption_original`` entry.

    Returns:
        list of str: the lemmatized, lower-cased tokens of the caption that
        are longer than one character and are not stop words.
    """
    text = str(row['caption_original'])
    # A missing caption stringifies to exactly 'nan'. Only blank the value in
    # that case: replacing the substring 'nan' everywhere would also mangle
    # real words such as "maintenant" or "financement".
    if text == 'nan':
        text = ''
    # normalise the typographic apostrophe to the ASCII one
    text = text.replace('’', '\'')
    # merge multi-word keywords into single '_'-joined tokens
    # NOTE(review): the text is split on whitespace only, so an elided form
    # such as "l'énergie" stays one token and turns into "lénergie" when the
    # apostrophe is deleted below (unless "'" was taken out of `exclude`) —
    # confirm whether elisions should be split off first.
    tokens = tokenizer.tokenize(text.split())
    # remove stop words
    stop_free = ' '.join(w for w in tokens if w.lower() not in stopWords and len(w) > 1)
    # remove punctuation
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    # lemmatize (each word is lemmatized once; one-character lemmas are dropped)
    lemmatized_words = []
    for word in punc_free.split():
        lemmatized = lemma.lemmatize(word)
        if len(lemmatized) > 1:
            lemmatized_words.append(lemmatized.lower())
    lemmas = ' '.join(lemmatized_words)
    # remove stop words that appear after lemmatization
    return [w for w in lemmas.split() if w not in stopWords and len(w) > 1]

def find_category(row):
    """Match the lemmatized tokens of a row against every keyword category.

    Reads ``row['lemmatized_text']`` (an iterable of tokens) and checks each
    token against the keyword set of every column in ``KEYWORDS_COLS`` via
    ``category_dict``.

    Returns a tuple ``(matched, categories)``:
      * ``matched``: the matching tokens in text order; a token is listed
        once per category that contains it.
      * ``categories``: sorted list of the categories that had at least one
        match, or ``['other']`` when nothing matched.
    """
    matched = []
    labels = set()
    for token in row['lemmatized_text']:
        # categories whose keyword set contains this token
        hits = [col for col in KEYWORDS_COLS if token in category_dict[col]]
        matched.extend([token] * len(hits))
        labels.update(hits)
    if not matched:
        labels.add('other')
    return matched, sorted(labels)

In [9]:
pd.options.display.max_rows = 100
# Read every input CSV, tag each row with its matched keywords and categories,
# and write the result under the same file name into the output directory.
# Helper columns that must not be written out; 'category' is re-appended as the last column.
dropped_cols = {'category', 'words_matched_list', 'lemmatized_text'}
filePaths = glob.glob(data_folder)
for filename in filePaths:
    print(filename)
    basename = os.path.basename(filename)
    outputFileName = outputDir + basename
    data_df = pd.read_csv(filename, encoding = 'utf-8')
    if data_df.shape[0] < 1:
        # nothing to classify: copy the (header-only) file through unchanged
        print('this file is empty: ' + basename)
        data_df.to_csv(outputFileName, encoding='utf-8', index=False)
        continue
    data_df = data_df.fillna('')
    try:
        data_df['lemmatized_text'] = data_df.apply(lemmatize_text, axis=1)
    except Exception as err:
        # Best effort: report the file and carry on with the next one.
        # `Exception` (not a bare except) lets KeyboardInterrupt stop the run,
        # and the message names the error type that was actually raised.
        print('Cannot process file: ' + basename)
        print("Due to " + type(err).__name__ + ":", err)
        continue
    # find_category returns a (keywords, categories) pair per row; zip(*...) splits it into two columns
    data_df['matched_keywords'], data_df['category'] = zip(*data_df.apply(find_category, axis=1))

    # Keep every other column in its existing order and put 'category' last.
    # Filtering (instead of list.remove) does not fail when an input file has
    # no 'words_matched_list' column.
    output_list = [c for c in data_df.columns if c not in dropped_cols]
    output_list.append('category')
    output_df = data_df[output_list]
    output_df.to_csv(outputFileName, encoding='utf-8', index=False)

./Accounts/QuestionsdargentCanada_facebook_statuses.csv
./Accounts/MPO_Centre_tweets.csv
./Accounts/PechesOceansCanada_facebook_statuses.csv
./Accounts/GCC_CCG_tweets.csv
./Accounts/GCAutochtones_tweets.csv
./Accounts/MPO_Science_tweets.csv
./Accounts/ressourcenaturellescanada_posts.csv
./Accounts/MPO_DFO_tweets.csv
./Accounts/TransportsetInfrastructureauCanada_facebook_statuses.csv
./Accounts/AAC_Canada_tweets.csv
./Accounts/EnvironnementetressourcesnaturellesauCanada_facebook_statuses.csv
./Accounts/GardeCotiereCanadienne_facebook_statuses.csv
./Accounts/ministre_ec_tweets.csv
./Accounts/MPO_GCC_Quebec_tweets.csv
./Accounts/OfficeDesTransportsDuCanada_facebook_statuses.csv
./Accounts/environnementcan_posts.csv
./Accounts/MineGiant_tweets.csv
./Accounts/MPO_TNL_tweets.csv
./Accounts/parcs.canada_posts.csv
./Accounts/MPO_Golfe_tweets.csv
./Accounts/ParcsCanada_facebook_statuses.csv
./Accounts/AgricultureCanadienne_facebook_statuses.csv
./Accounts/MPO_Pacifique_tweets.csv
./Accounts/env

In [10]:
# Preview the output frame left over from the last file the loop above processed successfully
output_df

Unnamed: 0.1,Unnamed: 0,id,date_published,link,caption_original,caption_cleaned,hashtags,num_shares,num_likes,Reactions_SUM,language,average_sentiment_score,sentiment,account_name,matched_keywords,category
0,0,928689911993110528,2017-11-09 18:25:03,[],RT @VeteransFR_CA: RT pour rendre hommage aux ...,RT RT pour rendre hommage aux rans canadiens V...,['#Lecanada'],2169,0,2169,fr,0.000000,Neutral,RNCan_tweeter,[],[other]
1,1,902587367462236160,2017-08-29 17:42:52,"['https://t.co/U3tTGAoKkw', 'https://t.co/2ddH...",Le changement climatique vous tient à cœur? Ai...,Le changement climatique vous tient ur Aidez n...,['#G'],56,68,124,fr,0.000000,Neutral,RNCan_tweeter,"[changement_climatique, avenir]","[Environmental, Social]"
2,2,899702457021984769,2017-08-21 18:39:16,"['https://t.co/ymXJGMc3vg', 'https://t.co/f6FR...",Le changement climatique vous tient à cœur? Ai...,Le changement climatique vous tient ur Aidez n...,['#G'],41,69,110,fr,0.000000,Neutral,RNCan_tweeter,"[changement_climatique, avenir]","[Environmental, Social]"
3,3,881145186969161728,2017-07-01 13:39:18,['https://t.co/Tk7SyvNZ6q'],RT @PMcanadien: Le PM invite les Canadiens à c...,RT PMcanadien Le PM invite les Canadiens brer ...,['#FeteDuC'],101,0,101,fr,0.000000,Neutral,RNCan_tweeter,[],[other]
4,4,918116142400622594,2017-10-11 14:08:40,['https://t.co/OnpO9Lxadv'],RT @PMcanadien: Chaque fille a la capacité de ...,RT PMcanadien Chaque fille la de changer notre...,['#Jou'],86,0,86,fr,0.000000,Neutral,RNCan_tweeter,[],[other]
5,5,897460824079556608,2017-08-15 14:11:49,"['https://t.co/ymXJGMc3vg', 'https://t.co/xzzL...",Moins consommer et mieux conserver l’énergie :...,Moins consommer et mieux conserver nergie comm...,['#G'],35,50,85,fr,0.000000,Neutral,RNCan_tweeter,[],[other]
6,6,935603698247589888,2017-11-28 20:17:58,[],RT @PMcanadien: Le premier ministre Justin Tru...,RT PMcanadien Le premier ministre Justin Trude...,[],77,0,77,fr,0.000000,Neutral,RNCan_tweeter,[ministre],[Economical]
7,7,877517660191428608,2017-06-21 13:24:48,[],RT @GCAutochtones: Célébrons les #Autochtones ...,RT GCAutochtones brons les Autochtones au pays...,"['#Autochtones', '#JNACana']",66,0,66,fr,0.000000,Neutral,RNCan_tweeter,"[autochtone, culture]",[Social]
8,8,912333914689212416,2017-09-25 15:12:10,['https://t.co/uyoEXpF9NW'],RT @CIF_IFC: #NationalForestWeek 🌲🌳 #DYK Canad...,RT NationalForestWeek DYK Canada home 28 world...,"['#NationalForestWeek', '#DYK', '#Canada150']",64,0,64,en,0.000000,Neutral,RNCan_tweeter,[],[other]
9,9,880794447365238785,2017-06-30 14:25:35,['https://t.co/t41BOAY0sR'],RT @PMcanadien: Célébrez la #FêteduCanada d’un...,RT PMcanadien brez la un autre Voyez les de ch...,"['#F', '#Can']",58,0,58,fr,0.000000,Neutral,RNCan_tweeter,[océan],[Environmental]
