# ANALYSIS

In [1]:
# import modules 
import csv
import math
import numpy as np
import pandas as pd
import lda
from numpy import genfromtxt
from nltk import PorterStemmer


In [2]:
# Import data
data = genfromtxt('data/tdm.csv', delimiter=',',
                  skip_header=1, dtype=np.int64)

words = genfromtxt('data/tdm.csv', delimiter=',',
                  skip_header=0, dtype=np.str_)
words=words[0]

In [3]:
# Model LDA

model = lda.LDA(n_topics=20, n_iter=500, random_state=1)
model.fit(data) 

<lda.lda.LDA instance at 0x7f0a2975d7e8>

In [4]:
# Most frequent words in every topic
vocab=words
topic_word = model.topic_word_  
n_top_words = 9
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-n_top_words:-1]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

Topic 0: euro area banknot currenc bank coin countri changeov
Topic 1: global intern economi financi emerg world import globalis
Topic 2: euro area countri competit state econom inflat unit
Topic 3: market integr financi euro european area secur singl
Topic 4: polici area euro fiscal countri govern monetari economi
Topic 5: question think say said time bank thank come
Topic 6: bank central institut system supervisori level framework nation
Topic 7: polici see economi econom paper rate shock work
Topic 8: asset price risk effect invest market increas low
Topic 9: price inflat euro growth area expect remain econom
Topic 10: countri bank central exchang intern access role process
Topic 11: rate govern council polici growth monetari exchang develop
Topic 12: inflat central bank monetari polici expect decis strategi
Topic 13: growth product labour reform market structur increas employ
Topic 14: financi risk system market crisi stabil institut sector
Topic 15: statist area data euro inform e

In [8]:
# SAVE LDA SCORES 
np.savetxt("data/LDA.csv", model.doc_topic_, delimiter=",")

# Dictionary Methods

In [5]:
# Import dictionary from excel file
df = pd.read_excel("data/LoughranMcDonald_MasterDictionary_2014.xlsx", skiprows=0)

In [6]:

tokens = [str(x).lower() for x in df['Word']]
tokens=[PorterStemmer().stem(t) for t in tokens]


In [7]:
def GetDictScore(data, dictionary, words):
    
    score=[0 for o in range(len(data))]
    for i in range(len(words)):
        try:
            value=int(dictionary[words[i]])
            if value>0:
                for j in range(len(data)):
                    score[j]=score[j]+value*data[j][i]
        except:
            pass

    return score



In [8]:
DictScores=[]

dictionaries=["Negative", "Positive", "Uncertainty", "Litigious", "Constraining", "Superfluous","Interesting", "Harvard_IV"]


for d in dictionaries:
    score = [str(x).lower() for x in df[d]] # or any other method
    dictionary=dict(zip(tokens,score))
    score=GetDictScore(data,dictionary,words)
    DictScores.append(score)

In [46]:
myfile = open("data/Dictionary_Scores.csv", 'w')
wr = csv.writer(myfile)
wr.writerows(DictScores)




# TF-IDF WITH CORPUS

In [9]:
# copute tf-idf using corpus as input
# initialize
corpus=[]
with open('data/tdm.csv', 'rb') as csvfile:
    data = csv.reader(csvfile)
    for row in data:
        corpus.append(row)
del corpus[0]

### TF(D,V)

In [10]:
# compute TF(DV)
tf=[]
for i in range(len(corpus)):
    scores=[]
    for num in corpus[i]:
        if int(num)==0:
            value=0
        else:
            value=1+math.log(float(num))
            
        scores.append(value)
    tf.append(scores)
    

### idf(V)

In [11]:
# Compute idf(v)

idf=[0]*len(corpus[0])

for v in range(len(corpus[0])):
    score=0
    for d in range(len(corpus)):
        if int(corpus[d][v])>0:
            score=score+1
    idf[v]=score

### tf-idf

In [12]:
# Compute tf-idf using the previous tf and idf

tf_idf=tf

for d in range(len(corpus)):
    for v in range(len(corpus[0])):
        tf_idf[d][v]=tf[d][v]*idf[v]
        


In [18]:
myfile = open("data/TF_IDF.csv", 'wb')
wr = csv.writer(myfile)
wr.writerows(tf_idf)

# Dictionary with TF-IDF

In [13]:
# Compute dictinary scores with tf-idf input
tfidf = genfromtxt("data/TF_IDF.csv", delimiter=',',
                   skip_footer=1)

In [14]:
DictScores=[]

dictionaries=["Negative", "Positive", "Uncertainty", "Litigious", "Constraining", "Superfluous","Interesting", "Harvard_IV"]


for d in dictionaries:
    score = [str(x).lower() for x in df[d]] # or any other method
    dictionary=dict(zip(tokens,score))
    score=GetDictScore(tfidf,dictionary,words)
    DictScores.append(score)

In [30]:

myfile = open("data/Dictionary_Scores_tfidf.csv", 'wb')
wr = csv.writer(myfile)
wr.writerows(DictScores)