In [1]:
import lightgbm as lgb
import pandas as pd
import csv

In [2]:
#read lemmatized arguments; for within - uncomment the section below

#read lemmatized arguments; for within - uncomment within and comment cross
X_train = pd.read_csv('lemmatized-data/cross_train.csv',encoding='utf-8', index_col='id')
X_test =  pd.read_csv('lemmatized-data/cross_test.csv',encoding='utf-8', index_col='id')
X_dev =  pd.read_csv('lemmatized-data/cross_dev.csv',encoding='utf-8', index_col='id')

#X_train = pd.read_csv('lemmatized-data/within_train.csv',encoding='utf-8', index_col='id')
#X_test =  pd.read_csv('lemmatized-data/within_test.csv',encoding='utf-8', index_col='id')
#X_dev =  pd.read_csv('lemmatized-data/within_dev.csv',encoding='utf-8', index_col='id')

In [3]:
y_train = pd.DataFrame(columns = ['is_same_side']) 
y_train['is_same_side'] = X_train['is_same_side']
y_test = pd.DataFrame(columns = ['is_same_side']) 
y_test['is_same_side'] = X_test['is_same_side']
y_dev = pd.DataFrame(columns = ['is_same_side']) 
y_dev['is_same_side'] = X_dev['is_same_side']

X_train = X_train.drop(columns='is_same_side')
X_test = X_test.drop(columns='is_same_side')
X_dev = X_dev.drop(columns='is_same_side')

In [4]:
# Extracting features function; for different options change the options in the function! 
#Options: vectorizer = ContVectorizer or TfidfVectorizer; ngram_range=(3, 3), (1,1), (1,2)
#Options to check: CV3, CV1, CV1-2, Tfidf1, Tfidf1-2
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
def extract_ngrams(X_train, X_test, X_dev, col, idx='id'):
    vectorizer = TfidfVectorizer(min_df=6, max_df=0.7, ngram_range=(1, 1), max_features=5000 )
    
    vectorizer.fit(X_train[col].values.astype('U'))
    features = vectorizer.transform(X_train[col].values.astype('U'))
    features_test = vectorizer.transform(X_test[col].values.astype('U'))
    features_dev = vectorizer.transform(X_dev[col].values.astype('U'))

    train_df =pd.DataFrame(
        features.todense(),
        columns=vectorizer.get_feature_names()
    )
    train_df = train_df.add_prefix(col)

    
    aid_df = X_train[[idx]]

    train_df = train_df.merge(aid_df, left_index =True, right_index=True, suffixes=(False, False), how='inner')
    train_df.set_index(idx, inplace=True)    
    
    test_df =pd.DataFrame(
        features_test.todense(),
        columns=vectorizer.get_feature_names()
    )
    test_df = test_df.add_prefix(col)

    
    aid_test_df = X_test[[idx]]

    test_df = test_df.merge(aid_test_df, left_index =True, right_index=True, suffixes=(False, False), how='inner')
    test_df.set_index(idx, inplace=True)
    
    dev_df =pd.DataFrame(
        features_dev.todense(),
        columns=vectorizer.get_feature_names()
    )
    dev_df = dev_df.add_prefix(col)

    
    aid_dev_df = X_dev[[idx]]

    dev_df = dev_df.merge(aid_dev_df, left_index =True, right_index=True, suffixes=(False, False), how='inner')
    dev_df.set_index(idx, inplace=True)
    return train_df, test_df, dev_df

def extract_n_grams_features(X_train, X_test, X_dev, columns, idx='id'): 

    X_train = X_train.reset_index()
    result_train_df =  X_train[[idx]]
    result_train_df.set_index(idx, inplace=True)
    
    X_test = X_test.reset_index()
    result_test_df =  X_test[[idx]]
    result_test_df.set_index(idx, inplace=True)
    
    X_dev = X_dev.reset_index()
    result_dev_df =  X_dev[[idx]]
    result_dev_df.set_index(idx, inplace=True)
    
    for col in columns:
        result_train_df_, result_test_df_, result_dev_df_ = extract_ngrams(X_train, X_test, X_dev, col)
        result_train_df = result_train_df.join(result_train_df_)
        result_test_df = result_test_df.join(result_test_df_)
        result_dev_df = result_dev_df.join(result_dev_df_)
    return result_train_df, result_test_df_, result_dev_df

In [5]:
# Extracting features; for different options change the options in the function! 
#Options: CV3, CV1, CV1-2, Tfidf3, Tfidf1, Tfidf1-2
X_train_, X_test_, X_dev_ = extract_n_grams_features(X_train, X_test, X_dev, columns=['argument1_lemmas', 'argument2_lemmas'])


In [5]:
#save n-grams-data; change the name of the file for different options: CV3, CV1, CV1-2, Tfidf1, Tfidf1-2
#within below
X_train_.to_csv("ngrams-data-cross/cross_train_CV3.csv")
X_test_.to_csv("ngrams-data-cross/cross_test_CV3.csv")
X_dev_.to_csv("ngrams-data-cross/cross_dev_CV3.csv")

#X_train_.to_csv("ngrams-data-within/within_train_CV3.csv")
#X_test_.to_csv("ngrams-data-within/within_test_CV3.csv")
#X_dev_.to_csv("ngrams-data-within/within_dev_CV3.csv")

In [6]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler(copy=True, with_mean=False)

scaler.fit(X_train_)
X_train = scaler.transform(X_train_)

scaler.fit(X_test_)
X_test = scaler.transform(X_test_)

scaler.fit(X_dev_)
X_dev = scaler.transform(X_dev_)

In [7]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
y_dev_ = y_dev['is_same_side'].tolist()
y_test_ = y_test['is_same_side'].tolist()
y_train_ = y_train['is_same_side'].tolist()

le.fit(y_dev_)
y_dev = le.transform(y_dev_)

le.fit(y_test_)
y_test = le.transform(y_test_)

le.fit(y_train_)
y_train = le.transform(y_train_)

y_dev = pd.Series(y_dev)
y_train = pd.Series(y_train)
y_test = pd.Series(y_test)

In [8]:
# create dataset for lightgbm; train on train, save results for dev and test
lgb_train = lgb.Dataset(X_train, y_train)

num_test, num_feature = X_train.shape

In [9]:
import json
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics import classification_report, confusion_matrix , accuracy_score, f1_score

def lgb_f1_score(y_hat, data):
    y_true = data.get_label()
    y_hat = np.round(y_hat) # scikits f1 doesn't like probabilities
    return 'f1', f1_score(y_true, y_hat), True

# specify your configurations as a dict
params = {
    'boosting_type': 'gbdt',
    'application': 'binary',
#    'metric': {'l2', 'l1'},
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

# generate feature names
feature_name = ['feature_' + str(col) for col in range(num_feature)]

print('Starting training...')
# train
evals_result = {}
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                valid_sets=lgb_train,
                early_stopping_rounds=5,
                #feval=lgb_f1_score,
                feature_name=feature_name,
                evals_result=evals_result)

print('Saving model...')
# save model to file
gbm.save_model('model.txt')

print('Dumping model to JSON...')
# dump model to JSON (and save to file)
model_json = gbm.dump_model()

with open('model.json', 'w+') as f:
    json.dump(model_json, f, indent=4)

# feature names
#print('Feature names:', gbm.feature_name())

# feature importances
#print('Feature importances:', list(gbm.feature_importance()))

print('Loading model to predict...')
# load model to predict
bst = lgb.Booster(model_file='model.txt')
# can only predict with the best iteration (or the saving iteration)

# after becoming results for dev: change X_dev to X_test
y_pred = bst.predict(X_dev, num_iteration=gbm.best_iteration)

# eval with loaded model
# after becoming results for dev: change y_dev to y_test if y_test exists, otherwise comment the next line
print("The rmse of loaded model's prediction is:", mean_squared_error(y_dev, y_pred) ** 0.5)

#print('Starting predicting...')
# predict
#y_pred = gbm.predict(X_dev, num_iteration=gbm.best_iteration)
# eval
#print('The rmse of prediction is:', mean_squared_error(y_dev.tolist(), y_pred.tolist()) ** 0.5)

#lgb.plot_metric(evals_result, metric='f1')

Starting training...
[1]	training's binary_logloss: 0.689969
Training until validation scores don't improve for 5 rounds.
[2]	training's binary_logloss: 0.686609
[3]	training's binary_logloss: 0.683162
[4]	training's binary_logloss: 0.680282
[5]	training's binary_logloss: 0.677801
[6]	training's binary_logloss: 0.675408
[7]	training's binary_logloss: 0.673367
[8]	training's binary_logloss: 0.671366
[9]	training's binary_logloss: 0.669111
[10]	training's binary_logloss: 0.666895
Did not meet early stopping. Best iteration is:
[10]	training's binary_logloss: 0.666895
Saving model...
Dumping model to JSON...
Loading model to predict...
The rmse of loaded model's prediction is: 0.49046516904642723


In [10]:
#adjust threshold
threshold = 0.5
predictions = []
for i_pred in y_pred.tolist():
    if i_pred >= threshold: predictions.append(1)
    else: predictions.append(0)
        
# after becoming results for dev: change y_dev to y_test if y_test exists, otherwise comment the next line
print(classification_report(y_dev.tolist(), predictions))

              precision    recall  f1-score   support

           0       0.59      0.56      0.57      1788
           1       0.60      0.62      0.61      1872

    accuracy                           0.59      3660
   macro avg       0.59      0.59      0.59      3660
weighted avg       0.59      0.59      0.59      3660

