In [1]:
import glob
import os
import pandas as pd
from langdetect import detect
import nltk
#nltk.download()   # comment after first download
from nltk.tokenize import wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import gensim
from gensim import corpora
import string
from pprint import pprint
import logging

pd.options.display.max_rows = 20

In [2]:
# set number of topics to classify
NUM_TOPICS = 3

In [3]:
# list = [social_media_csv_filepath, cleaned_text_column_name, raw_text_column_name]
tw_list = ['../tw/filtered_data_spell_corrected/*.csv', 'full_text_cleaned', 'text_original']
fb_list = ['../fb/filtered_data_spell_corrected/statuses/*.csv', 'status_message_cleaned', 'text_original']
in_list = ['../in/filtered_data_spell_corrected/posts/*.csv', 'caption', 'text_original']

In [4]:
stopWords_en = set(stopwords.words('english'))
stopWords_fr = set(stopwords.words('french'))
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()


# read csv files and save targt columns to dataframe
def import_csv_columns(list_loc):
    filePaths = glob.glob(list_loc[0])  
    df_loc = pd.DataFrame(columns=['cleaned_text', 'raw_text'])
    for filename in filePaths:
        #print(os.path.basename(filename))
        df_raw = pd.read_csv(filename)
        df_two_col = df_raw[[list_loc[1], list_loc[2]]]
        df_two_col.columns = df_loc.columns
        #display(len(df_two_col))
        df_loc = df_loc.append(df_two_col, ignore_index=True)
        #display(len(df_merge))
    df_loc = df_loc.dropna(axis=0, how='any')
    #print(len(df_loc))
    return df_loc


def detect_lang(text):
    try:
        lang = detect(text)
    except:
        return 'error'
    return lang


def normalize_text(row):   
    text = row['cleaned_text']
    stopWords = stopWords_en  
    if row['lang'] == 'fr':
        stopWords = stopWords_fr  
    stop_free = ' '.join([w for w in wordpunct_tokenize(text) if w.lower() not in stopWords
            and len(w) > 1 and w.isalnum()]) 
    punc_free = ''.join(ch.lower() for ch in stop_free if ch not in exclude)
    normalized = ' '.join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized.split()


In [5]:
# combine Twitter's tweets, Facebook's posts & Instagram's captions into dataframe 'df_merge'.
df_merge = pd.DataFrame(columns=['cleaned_text', 'raw_text'])
df_merge = df_merge.append(import_csv_columns(tw_list), ignore_index=True)
df_merge = df_merge.append(import_csv_columns(fb_list), ignore_index=True)
df_merge = df_merge.append(import_csv_columns(in_list), ignore_index=True)
df_merge

Unnamed: 0,cleaned_text,raw_text
0,: Droit devant avec ce mile nautique de plus! ...,RT @GCC_CCG: Droit devant avec ce mile nautiqu...
1,: étro déglaçage dans l'Arctique. Nous fournis...,RT @GCC_CCG: #JeudiRétro déglaçage dans l'Arct...
2,: Demandez à n'importe quel marin-de bons repa...,RT @GCC_CCG: Demandez à n'importe quel marin-d...
3,: Voyez le NGCC Cape Providence et le NGCC Thu...,RT @GCC_CCG: Voyez le NGCC Cape Providence et ...
4,Avis aux utilisateurs de surfaces glacées. égl...,Avis aux utilisateurs de surfaces glacées. #Dé...
5,Nous remercions &amp; offrons nos meilleurs vœ...,Nous remercions &amp; offrons nos meilleurs vœ...
6,: nouveaux refuges marins au large du Nunavut ...,RT @MPO_DFO: 7 nouveaux refuges marins au larg...
7,: Avis : se prépare à entamer ses opérations d...,RT @salledepresseGC: Avis : @GCC_CCG se prépar...
8,: étro le NGCC Terry Fox et son équipage font ...,RT @GCC_CCG: #JeudiRétro le NGCC Terry Fox et ...
9,: Notre Perlan perché entonne des cantiques av...,RT @MPO_Science: Notre Perlan perché entonne d...


In [6]:
# Detect language of a post. Several languages other than english or french are dectecd but makes up less than 1%
df_merge['lang'] = df_merge['cleaned_text'].apply(detect_lang)
df_merge.groupby('lang').count()

Unnamed: 0_level_0,cleaned_text,raw_text
lang,Unnamed: 1_level_1,Unnamed: 2_level_1
af,25,25
ca,55,55
cy,10,10
da,18,18
de,39,39
en,45806,45806
error,118,118
es,20,20
et,14,14
fi,4,4


In [7]:
# filter out exotic languages
df_merge = df_merge[(df_merge['lang'] == 'en') | (df_merge['lang'] == 'fr')]

In [None]:
# pre-processing text for LDA
df_merge['normalized_text'] = df_merge.apply(normalize_text, axis=1)
df_merge['normalized_text']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


0        [droit, devant, mile, nautique, plus, capitain...
1        [étro, déglaçage, arctique, fournissons, servi...
2        [demandez, importe, quel, marin, bons, repas, ...
3        [voyez, ngcc, cape, providence, ngcc, thunder,...
4        [avis, utilisateurs, surface, glacées, églaçag...
5        [remercions, amp, offrons, meilleurs, vœux, fi...
6        [nouveaux, refuge, marins, large, nunavut, amp...
7        [avis, prépare, entamer, opérations, déglaçage...
8        [étro, ngcc, terry, fox, équipage, font, escal...
9        [perlan, perché, entonne, cantiques, amis, le,...
                               ...                        
92367    [satellite, station, inuvik, receives, data, r...
92368    [year, ago, scientist, launched, canada, first...
92369    [blast, past, photo, taken, august, fourth, jo...
92370    [helping, municipality, improve, air, water, s...
92371    [natural, resource, canada, frank, de, rosiers...
92372    [alice, wilson, first, female, geologist, geol.

In [None]:
# train two LDA models, one for 'English', the other for 'French'
for lang in ['en', 'fr']:
    df_sub = df_merge[df_merge['lang'] == lang]
    doc_clean = df_sub['normalized_text'].tolist() 
    
    # Creating the term dictionary of our courpus, where every unique term is assigned an index. 
    dictionary = corpora.Dictionary(doc_clean)
    dictionary.save('LDA_dictionary_' + lang + '.dict')  # store the dictionary, for future reference
    
    # Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
    corpus = [dictionary.doc2bow(doc) for doc in doc_clean]
    corpora.MmCorpus.serialize('LDA_corpus_' + lang + '.mm', corpus)
    pprint(len(dictionary.token2id))
    
    logging.basicConfig(filename='lda_model_' + lang + '.log', format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    
    print('training model for ' + lang)
    # Running and Training LDA model on the document term matrix.
    ldamodel = gensim.models.ldamodel.LdaModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary, passes=5)
    
    # save results
    result = ldamodel.show_topics(num_topics=NUM_TOPICS, num_words=100, formatted=False)
    df_concat = pd.DataFrame()
    for i in range(0, NUM_TOPICS):
        df_tmp = pd.DataFrame(result[i][1], columns=['#' + str(result[i][0]) + '_word', '#' + str(result[i][0]) + '_prob'])
        df_concat = pd.concat([df_concat, df_tmp], axis=1)
        #display(df_concat)
    df_concat.to_csv('../LDA_classify_topics_' + lang + '.csv', index=None) 

20948
training model for en
29631
training model for fr


In [None]:
## Experiment on LSI model

# extract 3 LSI topics; use the default one-pass algorithm
lsi = gensim.models.lsimodel.LsiModel(corpus=corpus, id2word=dictionary, num_topics=NUM_TOPICS)
# print the most contributing words (both positively and negatively) for each of the first ten topics
lsi.print_topics()