In [5]:
import lightgbm as lgb
import pandas as pd
import csv

In [6]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Kate\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Kate\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Kate\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
#read the data; for within - uncomment within and comment cross

cross_train_df = pd.read_csv('cross-topic/train_rand.csv',quotechar='"',quoting=csv.QUOTE_ALL,encoding='utf-8',escapechar='\\',doublequote=False, index_col='id')
cross_test_df = pd.DataFrame(columns = ['id','argument1','argument1_id','argument2','argument2_id','debate_id','is_same_side','topic'])
cross_test_df = pd.read_csv('cross-topic/test_rand.csv',quotechar='"',quoting=csv.QUOTE_ALL,encoding='utf-8',escapechar='\\',doublequote=False, index_col='id')
cross_dev_df = pd.DataFrame(columns = ['id','argument1','argument1_id','argument2','argument2_id','debate_id','is_same_side','topic'])
cross_dev_df = pd.read_csv('cross-topic/dev_rand.csv',quotechar='"',quoting=csv.QUOTE_ALL,encoding='utf-8',escapechar='\\',doublequote=False, index_col='id')

#within_train_df =  pd.read_csv('within-topic/test_rand.csv',quotechar='"',quoting=csv.QUOTE_ALL,encoding='utf-8',escapechar='\\',doublequote=False, index_col='id')
#within_test_df = pd.Dataframe(columns = ['id','argument1','argument1_id','argument2','argument2_id','debate_id','is_same_side','topic'])
#within_test_df =  pd.read_csv('within-topic/test_rand.csv',quotechar='"',quoting=csv.QUOTE_ALL,encoding='utf-8',escapechar='\\',doublequote=False, index_col='id')
#within_dev_df = pd.Dataframe(columns = ['id','argument1','argument1_id','argument2','argument2_id','debate_id','is_same_side','topic'])
#within_dev_df =  pd.read_csv('within-topic/test_rand.csv',quotechar='"',quoting=csv.QUOTE_ALL,encoding='utf-8',escapechar='\\',doublequote=False, index_col='id')

In [8]:
#lemmatization function
from nltk.corpus import wordnet
from nltk.tokenize import sent_tokenize, word_tokenize
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer


def get_wordnet_pos(treebank_tag):
        """
        return WORDNET POS compliance to WORDENT lemmatization (a,n,r,v) 
        """
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            # As default pos in lemmatization is Noun
            return wordnet.NOUN

def lemmatize_stemming(token, pos_tag):
    stemmer = SnowballStemmer("english") #pOrter, M. "An algorithm for suffix stripping."
    return stemmer.stem(WordNetLemmatizer().lemmatize(token, pos=pos_tag))

def preprocess(text):
    lemma = []
    for sentence in sent_tokenize(text):
        sentence = sentence.replace('\n', ' ').strip()
        tokens = [token for token in word_tokenize(sentence)]
        pos_tags = nltk.pos_tag(tokens)
        
        for idx in range(0,len(tokens)):
            token = tokens[idx].lower()
            if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
                wordnet_pos = get_wordnet_pos(pos_tags[idx][1])
                l_ = lemmatize_stemming(token, wordnet_pos)
                lemma.append(l_)
    return ' '.join(lemma)

def get_lemma(row):
    row['argument1_lemmas'] = preprocess(row['argument1'])
    row['argument2_lemmas'] = preprocess(row['argument2'])
    return row

In [9]:
#extraction ngrams function; 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
def extract_ngrams(X_train, X_test, X_dev, col, idx='id'):
    vectorizer = CountVectorizer(min_df=6, max_df=0.7, ngram_range=(3, 3), max_features=5000 )
    
    vectorizer.fit(X_train[col].values.astype('U'))
    features = vectorizer.transform(X_train[col].values.astype('U'))
    features_test = vectorizer.transform(X_test[col].values.astype('U'))
    features_dev = vectorizer.transform(X_dev[col].values.astype('U'))

    train_df =pd.DataFrame(
        features.todense(),
        columns=vectorizer.get_feature_names()
    )
    train_df = train_df.add_prefix(col)

    
    aid_df = X_train[[idx]]

    train_df = train_df.merge(aid_df, left_index =True, right_index=True, suffixes=(False, False), how='inner')
    train_df.set_index(idx, inplace=True)    
    
    test_df =pd.DataFrame(
        features_test.todense(),
        columns=vectorizer.get_feature_names()
    )
    test_df = test_df.add_prefix(col)

    
    aid_test_df = X_test[[idx]]

    test_df = test_df.merge(aid_test_df, left_index =True, right_index=True, suffixes=(False, False), how='inner')
    test_df.set_index(idx, inplace=True)
    
    dev_df =pd.DataFrame(
        features_dev.todense(),
        columns=vectorizer.get_feature_names()
    )
    dev_df = dev_df.add_prefix(col)

    
    aid_dev_df = X_dev[[idx]]

    dev_df = dev_df.merge(aid_dev_df, left_index =True, right_index=True, suffixes=(False, False), how='inner')
    dev_df.set_index(idx, inplace=True)
    return train_df, test_df, dev_df

def extract_n_grams_features(X_train, X_test, X_dev, columns, idx='id'): 

    X_train = X_train.reset_index()
    result_train_df =  X_train[[idx]]
    result_train_df.set_index(idx, inplace=True)
    
    X_test = X_test.reset_index()
    result_test_df =  X_test[[idx]]
    result_test_df.set_index(idx, inplace=True)
    
    X_dev = X_dev.reset_index()
    result_dev_df =  X_dev[[idx]]
    result_dev_df.set_index(idx, inplace=True)
    
    for col in columns:
        result_train_df_, result_test_df_, result_dev_df_ = extract_ngrams(X_train, X_test, X_dev, col)
        result_train_df = result_train_df.join(result_train_df_)
        result_test_df = result_test_df.join(result_test_df_)
        result_dev_df = result_dev_df.join(result_dev_df_)
    return result_train_df, result_test_df_, result_dev_df

In [10]:
def cut_df(df):
    X = df[['argument1', 'argument2', 'topic']]
    y = df[['is_same_side']]
    return X, y

In [None]:
# 1. Getting train, test and dev data; for within - uncomment within and comment cross
X_train, y_train = cut_df(cross_train_df)
X_test, y_test = cut_df(cross_test_df)
X_dev, y_dev = cut_df(cross_dev_df)
print('1')


#X_train, y_train = cut_df(within_train_df)
#X_test, y_test = cut_df(within_test_df)
#X_dev, y_dev = cut_df(within_dev_df)
#print('1')

# 2. Lemmatizing argument1 and argument2
X_train = X_train.apply(get_lemma, axis=1)
X_test = X_test.apply(get_lemma, axis=1)
X_dev = X_dev.apply(get_lemma, axis=1)


print('2')

In [None]:
#save lemmatized arguments
train = X_train
train['is_same_side'] = y_train
test = X_test
test['is_same_side'] = y_test
dev = X_dev
dev['is_same_side'] = y_dev

#for within - uncomment within and comment cross
train.to_csv("lemmatized/cross_train.csv")
test.to_csv("lemmatized/cross_test.csv")
dev.to_csv("lemmatized/cross_dev.csv")

#train.to_csv("lemmatized-data/within_train.csv")
#test.to_csv("lemmatized-data/within_test.csv")
#dev.to_csv("lemmatized-data/within_dev.csv")

In [22]:
# 3. Extracting features using CountVectorizer with ngrams_range = (3,3)
X_train_, X_test_, X_dev_ = extract_n_grams_features(X_train, X_test, X_dev, columns=['argument1_lemmas', 'argument2_lemmas'])

print('3')

3


In [23]:
#save n-grams-data
#within below
X_train_.to_csv("ngrams-cross/cross_train_CV3.csv")
X_test_.to_csv("ngrams-cross/cross_test_CV3.csv")
X_dev_.to_csv("ngrams-cross/cross_dev_CV3.csv")

#X_train_.to_csv("ngrams-data-within/within_train_CV3.csv")
#X_test_.to_csv("ngrams-data-within/within_test_CV3.csv")
#X_dev_.to_csv("ngrams-data-within/within_dev_CV3.csv")

In [24]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler(copy=True, with_mean=False)

scaler.fit(X_train_)
X_train = scaler.transform(X_train_)

scaler.fit(X_test_)
X_test = scaler.transform(X_test_)

scaler.fit(X_dev_)
X_dev = scaler.transform(X_dev_)

In [25]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
y_dev_ = y_dev['is_same_side'].tolist()
y_test_ = y_test['is_same_side'].tolist()
y_train_ = y_train['is_same_side'].tolist()

le.fit(y_dev_)
y_dev = le.transform(y_dev_)

le.fit(y_test_)
y_test = le.transform(y_test_)

le.fit(y_train_)
y_train = le.transform(y_train_)

y_dev = pd.Series(y_dev)
y_train = pd.Series(y_train)
y_test = pd.Series(y_test)

In [26]:
# create dataset for lightgbm; train on train/dev, then save results for test
lgb_train = lgb.Dataset(X_train, y_train)

num_test, num_feature = X_train.shape

In [27]:
import json
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics import classification_report, confusion_matrix , accuracy_score, f1_score

def lgb_f1_score(y_hat, data):
    y_true = data.get_label()
    y_hat = np.round(y_hat) # scikits f1 doesn't like probabilities
    return 'f1', f1_score(y_true, y_hat), True

# specify your configurations as a dict
params = {
    'boosting_type': 'gbdt',
    'application': 'binary',
#    'metric': {'l2', 'l1'},
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

# generate feature names
feature_name = ['feature_' + str(col) for col in range(num_feature)]

print('Starting training...')
# train
evals_result = {}
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                valid_sets=lgb_train,
                early_stopping_rounds=5,
                #feval=lgb_f1_score,
                feature_name=feature_name,
                evals_result=evals_result)

print('Saving model...')
# save model to file
gbm.save_model('model.txt')

print('Dumping model to JSON...')
# dump model to JSON (and save to file)
model_json = gbm.dump_model()

with open('model.json', 'w+') as f:
    json.dump(model_json, f, indent=4)

# feature names
#print('Feature names:', gbm.feature_name())

# feature importances
#print('Feature importances:', list(gbm.feature_importance()))

print('Loading model to predict...')
# load model to predict
bst = lgb.Booster(model_file='model.txt')
# can only predict with the best iteration (or the saving iteration)
# after becoming results for dev: change X_dev to X_test
y_pred = bst.predict(X_dev, num_iteration=gbm.best_iteration)
# eval with loaded model
# after becoming results for dev: change X_dev to X_test
print("The rmse of loaded model's prediction is:", mean_squared_error(y_dev, y_pred) ** 0.5)

#print('Starting predicting...')
# predict
#y_pred = gbm.predict(X_dev, num_iteration=gbm.best_iteration)
# eval
#print('The rmse of prediction is:', mean_squared_error(y_dev.tolist(), y_pred.tolist()) ** 0.5)

#lgb.plot_metric(evals_result, metric='f1')

Starting training...
[1]	training's binary_logloss: 0.691384
Training until validation scores don't improve for 5 rounds.
[2]	training's binary_logloss: 0.690014
[3]	training's binary_logloss: 0.688868
[4]	training's binary_logloss: 0.687784
[5]	training's binary_logloss: 0.686804
[6]	training's binary_logloss: 0.68551
[7]	training's binary_logloss: 0.68459
[8]	training's binary_logloss: 0.68342
[9]	training's binary_logloss: 0.68261
[10]	training's binary_logloss: 0.681591
Did not meet early stopping. Best iteration is:
[10]	training's binary_logloss: 0.681591
Saving model...
Dumping model to JSON...
Loading model to predict...


In [None]:
#adjust threshold
threshold = 0.5
predictions = []
for i_pred in y_pred.tolist():
    if i_pred >= threshold: predictions.append(1)
    else: predictions.append(0)
        
# after becoming results for dev: change y_dev to y_test
print(classification_report(y_dev.tolist(), predictions))