In [469]:
from __future__ import print_function

import pandas as pd 
import glob
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import keras
from keras.layers import LSTM

In [470]:
vectorizer = TfidfVectorizer()

In [471]:
L = []
csv_num = 0
for csv in glob.glob('*.csv'):
#     if (csv != "coder_1.csv"):
    df1 = pd.read_csv(csv, encoding='latin-1')
    df1['File Number'] = csv
    L.append(df1)
    csv_num += 1
    
df = pd.concat(L, axis = 0, sort=False)
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df.columns.str.rstrip()
df.columns = df.columns.str.replace(' ', '_')

In [472]:
df.nunique()

Coder_Number               10
Question_ID              3508
Question_Link            3508
Question_Text            3940
File_Number                17
Euroskepticism_Score        3
Euroskepticism_Reason     578
Nationalism_Score           3
Nationalism_Reason        279
Populism_Score              3
Populism_Reason           273
dtype: int64

# Data Cleaning

In [473]:
df['Euroskepticism_Score'].unique()

array([nan, 0. , 0.5, 1. ])

In [474]:
df['Nationalism_Score'].unique()

array([nan, 0. , 0.5, 1. ])

In [475]:
df['Populism_Score'].unique()

array([nan, 0. , 1. , 0.5])

In [477]:
df.loc[~df['Nationalism_Score'].isin([0. , 0.5, 1. ])]['File_Number'].unique()
df.loc[df['Euroskepticism_Score'].isnull()]['File_Number'].unique()

array(['coderp2_0.csv', 'coderp2_1.csv', 'coderp2_2.csv', 'coderp2_3.csv',
       'coderp2_8.csv', 'coderp2_9.csv'], dtype=object)

In [479]:
df = df.dropna(subset=['Euroskepticism_Score', 'Populism_Score', 'Nationalism_Score'])
df['Euroskepticism_Score'] = df['Euroskepticism_Score'].astype('float')
df['Nationalism_Score'] = df['Nationalism_Score'].astype('float')
df['Populism_Score'] = df['Populism_Score'].astype('float')
df['Question_Text'] = df['Question_Text'].astype('str')

In [480]:
df.nunique()

Coder_Number               10
Question_ID              2861
Question_Link            2861
Question_Text            3267
File_Number                14
Euroskepticism_Score        3
Euroskepticism_Reason     578
Nationalism_Score           3
Nationalism_Reason        279
Populism_Score              3
Populism_Reason           273
dtype: int64

In [481]:
max_of_scores = dict()

for question_id in df['Question_ID'].unique():
    a = np.max(df[(df['Question_ID'] == question_id)]['Euroskepticism_Score'].fillna(0))
    b = np.max(df[(df['Question_ID'] == question_id)]['Nationalism_Score'].fillna(0))
    c = np.max(df[(df['Question_ID'] == question_id)]['Populism_Score'].fillna(0))
    T = df[(df['Question_ID'] == question_id)]['Question_Text']
#     file_num = df[(df['Question_ID'] == question_id)]['File Number']
    max_of_scores[question_id] = [T[T.index[0]], question_id, a, b, c]

scores = pd.DataFrame.from_dict(max_of_scores, orient = 'index')
scores = scores.rename(columns = {0: 'Question_Text', 1: 'Question_ID', 2: 'Euroskepticism_Score', 
                                        3: 'Nationalism_Score', 4: 'Populism_Score'})
scores.index = range(len(scores))

scores['Question_Text'] = scores['Question_Text'].apply(lambda x: str(x))
scores['Question_ID'] = scores['Question_ID'].apply(lambda x: str(x))
# scores['Question_ID'] = scores['File Number'].apply(lambda x: str(x))
for score_type in ['Euroskepticism_Score','Nationalism_Score', 'Populism_Score']:
    scores[score_type] = scores[score_type].astype('float')
    scores[score_type] = scores[score_type].apply(np.ceil)

In [482]:
scores

Unnamed: 0,Question_Text,Question_ID,Euroskepticism_Score,Nationalism_Score,Populism_Score
0,Further to the answer it gave to my Question E...,E-013527-13,0.0,0.0,0.0
1,In Italy as in many other Member States legal ...,E-001570-13,1.0,0.0,1.0
2,On 1 September 2013 Iraqi forces carried out a...,P-010965-13,0.0,0.0,0.0
3,In the Work Programme of the Agency for the Co...,E-011939-13,0.0,0.0,0.0
4,Identification marking on foods which are avai...,E-003034-13,0.0,0.0,0.0
5,According to estimates in Germany alone every ...,E-002177-13,0.0,0.0,0.0
6,In the context of the review of the EU policy ...,E-003105-13,1.0,0.0,0.0
7,Directive 2004/82/EC is intended to improve bo...,E-001109-13,0.0,0.0,0.0
8,Directive 2004/82/EC is intended to improve bo...,E-001108-13,0.0,0.0,0.0
9,On 27 February 2013 Mr Kallas on behalf of the...,E-005785-13,1.0,0.0,0.0


In [446]:
scores.to_csv("merged_scores.csv")

In [None]:
print(scores['Euroskepticism_Score'].unique())
print(scores['Nationalism_Score'].unique())
print(scores['Populism_Score'].unique())

# Ensemble Methods

In [492]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import matthews_corrcoef

In [493]:
X = vectorizer.fit_transform(scores['Question_Text'])

##### Euroskepticism

In [485]:
X_train, X_test, y_train, y_test = train_test_split(X, scores['Euroskepticism_Score'], test_size = .3)
gbc = GradientBoostingClassifier()
gbc.fit(X_train, y_train)
y_pred = gbc.predict(X_test)

In [486]:
print("accuracy: {:.4f} \n".format(accuracy_score(y_test, y_pred)))
print("confusion matrix: \n",  confusion_matrix(y_test, y_pred), "\n")
print("matthews_corrcoef:  {:.4f}".format(matthews_corrcoef(y_test, y_pred)))

accuracy: 0.8359 

confusion matrix: 
 [[713  10]
 [131   5]] 

matthews_corrcoef:  0.0639


##### Nationalism

In [487]:
X_train, X_test, y_train, y_test = train_test_split(X, scores['Nationalism_Score'], test_size = .3)
gbc.fit(X_train, y_train)
y_pred = gbc.predict(X_test)

In [488]:
print("accuracy: {:.4f} \n".format(accuracy_score(y_test, y_pred)))
print("confusion matrix: \n",  confusion_matrix(y_test, y_pred), "\n")
print("matthews_corrcoef:  {:.4f}".format(matthews_corrcoef(y_test, y_pred)))

accuracy: 0.9255 

confusion matrix: 
 [[793   6]
 [ 58   2]] 

matthews_corrcoef:  0.0685


##### Populism

In [489]:
X_train, X_test, y_train, y_test = train_test_split(X, scores['Populism_Score'], test_size = .3)
gbc.fit(X_train, y_train)
y_pred = gbc.predict(X_test)

In [490]:
print("accuracy: {:.4f} \n".format(accuracy_score(y_test, y_pred)))
print("confusion matrix: \n",  confusion_matrix(y_test, y_pred), "\n")
print("matthews_corrcoef:  {:.4f}".format(matthews_corrcoef(y_test, y_pred)))

accuracy: 0.9092 

confusion matrix: 
 [[780   2]
 [ 76   1]] 

matthews_corrcoef:  0.0505


# 1D CNN for text classification with Keras

In [494]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.datasets import imdb

In [495]:
X

<2861x20246 sparse matrix of type '<class 'numpy.float64'>'
	with 256022 stored elements in Compressed Sparse Row format>

In [496]:
x_train, x_test, y_train, y_test = train_test_split(X, scores['Populism_Score'], test_size = .3)

In [497]:
max_features = 5000 #limit vocab 
maxlen = 400 #word sequence length
batch_size = 32
embedding_dims = 50
filters = 250
kernel_size = 3 #filter dim
hidden_dims = 250
epochs = 6


print('Build model...')
model = Sequential()

# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
model.add(Embedding(max_features,
                    embedding_dims,
                    input_length=X.shape[1]))
model.add(Dropout(0.2))

# word group filters of size filter_length:
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))

model.add(GlobalMaxPooling1D())

model.add(Dense(hidden_dims))
model.add(Dropout(0.2))  #fraction of neurons to drop
model.add(Activation('relu'))

# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))

Build model...


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 2002 samples, validate on 859 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.callbacks.History at 0xf501509128>

In [None]:
y_pred = model.predict(x_test)
# x_train, x_test, y_train, y_test = train_test_split(X, scores['Populism_Score'], test_size = .3)
y_test_np = y_test.to_numpy() 

In [534]:
def perf_measure(y_actual, y_pred):
    TP = 0
    FP = 0
    TN = 0
    FN = 0

    for i in range(len(y_pred)): 
        if y_actual[i]==y_pred[i]==1:
           TP += 1
        if y_pred[i]==1 and y_actual[i]!=y_pred[i]:
           FP += 1
        if y_actual[i]==y_pred[i]==0:
           TN += 1
        if y_pred[i]==0 and y_actual[i]!=y_pred[i]:
           FN += 1

    A = np.array([[TN, FP], [FN, TP]])
    return A

In [537]:
#y_test = y_actual
pop_cm = perf_measure(y_test_np, np.argmax(y_pred, axis=1))

In [536]:
print("confusion matrix: \n",  pop_cm, "\n")
print("matthews_corrcoef:  {:.4f}".format(matthews_corrcoef(y_test_np, np.argmax(y_pred, axis=1))))

confusion matrix: 
 [[776   0]
 [ 83   0]] 

matthews_corrcoef:  0.0000


# LSTM with Keras 

In [None]:
# set parameters:
max_features = 5000
maxlen = 400
batch_size = 32
embedding_dims = 50
filters = 250
kernel_size = 3
hidden_dims = 250
# epochs = 15

In [None]:
print('Build model...')
model = Sequential()
model.add(Embedding(max_features, 128))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy', #different loss function may be needed
              optimizer='adam',
              metrics=['accuracy'])

print('Train...')
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=2,
          validation_data=(x_test, y_test))
score, acc = model.evaluate(x_test, y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)