In [1]:
import glob
import os
import pandas as pd

from langdetect import detect
import nltk
#nltk.download()   # comment after first download
from nltk.tokenize import wordpunct_tokenize, MWETokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import gensim
from gensim import corpora
import string
from numbers import Number
from pprint import pprint
import logging
import operator

pd.options.display.max_rows = 30

In [2]:
keywords_chosen = './EnglishData/Keywords_ECCC_EN.csv'

# list = [social_media_csv_filepath, cleaned_text_column_name, raw_text_column_name]
data_folder = './EnglishData/Accounts/*.csv'

CSV_COLUMNS = ['caption_cleaned', 'hashtags']

In [3]:
stopWords = set(stopwords.words('english'))
exclude = set(string.punctuation)
exclude.remove('_')
exclude.remove('-')
lemma = WordNetLemmatizer()


def detect_lang(text):
    try:
        lang = detect(text)
    except:
        return 'error'
    return lang


def lemmatize_keywords(col):
    return '_'.join(lemma.lemmatize(word).lower() for word in col.split()) 


def lemmatize_text(row):    
    text_cols = str(row['caption_cleaned']).lower(), str(row['hashtags']).lower()
    normalized_text = []
    if text_cols[0] == 'nan':
        return '', ''
    for text in text_cols:
        if isinstance(text, str):
            tokens = tokenizer.tokenize(text.split())
            
            stop_free = ' '.join(w for w in tokens if w not in stopWords and len(w) > 1)
            punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
            normalized = ' '.join(lemma.lemmatize(word).lower() for word in punc_free.split() if len(lemma.lemmatize(word)) > 2)
            #print(normalized)
            normalized_text.append(normalized.split())
        else:
            normalized_text = ['', '']
    return normalized_text[0], normalized_text[1]


def find_category(row):
    text_cols = row['lemmatized_caption_cleaned'], row['lemmatized_hashtags']
    keywords_found = []
    counter = {'Social': 0, 'Economical': 0, 'Environmental': 0}
    category = 'unknown'
    for text in text_cols:
        for word in text:
            if word in soc_list:
                counter['Social'] += 1
                keywords_found.append(word)
            if word in econ_list:
                counter['Economical'] += 1
                keywords_found.append(word)
            if word in env_list:
                counter['Environmental'] += 1
                keywords_found.append(word)
    if len(keywords_found) > 0:
        category = max(counter.items(), key=operator.itemgetter(1))[0]    
    return keywords_found, category

In [4]:
# load keywords list
pd.options.display.max_rows = 10
KEYWORD_COL = ['Social', 'Economical', 'Environmental']
keywords_df = pd.read_csv(keywords_chosen, encoding = "ISO-8859-1")
lemma_keywords_df = pd.DataFrame(columns=KEYWORD_COL)
for col in KEYWORD_COL:
    lemma_keywords_df[col] = keywords_df[col].astype(str).apply(lemmatize_keywords)
display(lemma_keywords_df)
soc_list = set(lemma_keywords_df['Social'].tolist())
soc_list.remove('nan')
econ_list = set(lemma_keywords_df['Economical'].tolist())
econ_list.remove('nan')
env_list = set(lemma_keywords_df['Environmental'].tolist())
env_list.remove('nan')
multi_word = [w.split('_') for w in soc_list.union(econ_list).union(env_list) ]   #if '_' in w 
print(multi_word)
tokenizer = MWETokenizer(multi_word)


Unnamed: 0,Social,Economical,Environmental
0,family,eating_local,biking
1,friend,energy,composting
2,public_concern,growth,recycling
3,spouse,infrastructure,climate_change
4,kid,circular_economy,low_carbon
...,...,...,...
83,,,summit
84,,,agri-food
85,,,guard
86,,,biology


[['civil'], ['green', 'economy'], ['culture'], ['air', 'quality'], ['sustainable', 'food'], ['green'], ['wildlife'], ['biology'], ['human', 'right'], ['tourism'], ['safe'], ['spouse'], ['discrimination'], ['gender'], ['carbon'], ['conserve'], ['integrated'], ['commission'], ['public', 'concern'], ['health'], ['gold'], ['alternative'], ['carbon', 'offset'], ['conservation'], ['household'], ['interdisciplinary'], ['agri-food'], ['kid'], ['deforestation'], ['resource'], ['biodiversity'], ['wetland'], ['hi-velocity', 'system'], ['park'], ['fishery'], ['emission'], ['economy', 'fund'], ['cost', 'benefit'], ['bioenergy'], ['child'], ['guard'], ['sea', 'level'], ['healthy'], ['geothermal'], ['justice'], ['equity'], ['minority'], ['eating', 'local'], ['green', 'tourism'], ['stakeholder'], ['provincal'], ['energy'], ['aboriginal', 'tourism'], ['ocean'], ['minister'], ['life', 'expectancy'], ['entrepreneurship'], ['gas'], ['arctic'], ['forest', 'practice'], ['water', 'use'], ['material'], ['spec

In [5]:
pd.options.display.max_rows = 1000
# read csv files and save targt columns to dataframe
filePaths = glob.glob(data_folder)  
for filename in filePaths:
    display(filename)
    #print(os.path.basename(filename))
    data_df = pd.read_csv(filename)
    data_df['lang'] = data_df['caption_cleaned'].astype(str).apply(detect_lang)
    wrong_lang = data_df[data_df['lang'] != 'en'].shape[0]    
    print(wrong_lang/len(data_df))
    data_df['lemmatized_caption_cleaned'], data_df['lemmatized_hashtags']  = zip(*data_df.apply(lemmatize_text, axis=1))
    data_df['keywords_found'], data_df['category'] = zip(*data_df.apply(find_category, axis=1))
    #data_df.to_csv(filename, index=None)
    display(data_df[['words_matched_list', 'lang', 'keywords_found', 'category']])

'./EnglishData/Accounts/CanadianAgriculture_facebook_statuses.csv'

0.0544280442804428


Unnamed: 0,words_matched_list,lang,keywords_found,category
0,"['gas', 'emissions ', 'greenhouse gas']",en,"[greenhouse_gas, emission]",Environmental
1,,en,[],unknown
2,['or'],en,[],unknown
3,,en,[],unknown
4,,en,[],unknown
5,"['future', 'futur']",en,[future],Social
6,,en,[],unknown
7,['policy'],en,[policy],Economical
8,['or'],en,[],unknown
9,['innovation'],en,[innovation],Economical


'./EnglishData/Accounts/CanadianCoastGuard_facebook_statuses.csv'

0.12692307692307692


Unnamed: 0,words_matched_list,lang,keywords_found,category
0,"['infrastructure', 'transport', 'armed forces']",en,"[coast, guard, guard, transport, infrastructur...",Environmental
1,"['or', 'ice', 'coast']",en,[growth],Economical
2,,en,[],unknown
3,['safe'],en,[safe],Social
4,"['guard', 'coast']",en,"[coast, guard, guard]",Environmental
5,"['students', 'safe']",en,"[student, safe]",Social
6,"['building', 'ice', 'coast', 'guard', 'air', '...",en,"[local, finance, coast, guard, guard]",Social
7,"['guard', 'or', 'lakes', 'ice', 'coast']",en,"[coast, guard, guard, coast, guard, guard, coa...",Environmental
8,"['water', 'or']",en,[],unknown
9,"['ice', 'coast', 'guard', 'coastal', 'mer', 'a...",en,"[coast, guard, guard, health, coastal, marine]",Environmental


'./EnglishData/Accounts/AskISED_tweets.csv'

0.0


Unnamed: 0,words_matched_list,lang,keywords_found,category
0,"['money', 'finance']",en,"[job, finance]",Economical


'./EnglishData/Accounts/AAFC_Canada_tweets.csv'

0.012238325281803542


Unnamed: 0,words_matched_list,lang,keywords_found,category
0,,en,[],unknown
1,['women'],en,[woman],Social
2,['justice'],en,[],unknown
3,['jobs'],en,[job],Economical
4,"['ice', 'mer']",en,[],unknown
5,,en,[],unknown
6,['community'],en,[community],Social
7,"['eau', 'or', 'mer']",en,[minister],Economical
8,['resilience'],en,[resilience],Social
9,"['archives', 'library', 'women']",en,[woman],Social
