In [1]:
import time
import pandas as pd
import sklearn
import sklearn.ensemble as ske
import sklearn.linear_model as sklm
import sklearn.neural_network as sknn
import vaex
import vaex.ml
import vaex.ml.sklearn
import numpy as np

In [2]:
# Columns handed to the multi-hot encoder further down.
features = [
    'EventType',
    'EventDetails',
    'EventCountry',
    'Actor1Code',
    'Actor1Name',
]
# Name of the regression target column (the model cells hard-code the same string).
target = 'AvgTone'

In [3]:
def vx_reader(folder):
    """Open the dataset at ``folder`` with vaex, clean it, and split it by date.

    Progress messages and wall-clock timings for the read and cleanup phases
    are printed to stdout.

    Cleanup steps, in order:
      1. drop rows containing missing values;
      2. keep rows with ``NumMentions > 2000`` and ``SQLDATE > 20000000``;
      3. split ``EventCode`` into ``EventType`` (first two characters) and
         ``EventDetails`` (the remainder);
      4. split ``ActionGeo_ADM1Code`` into ``EventCountry`` (first two
         characters) and ``EventRegion`` (the remainder);
      5. drop the two source columns and drop rows with missing values again.

    Args:
        folder: Path passed straight to ``vaex.open``.

    Returns:
        A tuple ``(test, train, vaex_df)`` -- note that the test split comes
        first. ``test`` holds rows with ``SQLDATE > 20200000``, ``train`` holds
        rows with ``SQLDATE < 20200000`` and ``vaex_df`` is the cleaned frame
        before splitting. A row whose ``SQLDATE`` equals 20200000 exactly lands
        in neither split.
    """
    print("Starting read")
    read_time = time.time()
    # NOTE(review): SQLDATE is requested as str here but compared against
    # integers below -- confirm which dtype the column really has after loading.
    vaex_df = vaex.open(folder,
                        dtype={'Actor1Code': str,
                               'Actor1Name': str,
                               'EventCode': str,
                               'ActionGeo_ADM1Code': str,
                               'SQLDATE': str})
    print("Read completed in", time.time() - read_time, "seconds")
    print("Starting cleanup")
    clean_time = time.time()
    # dropna() returns a new DataFrame instead of modifying in place, so the
    # result has to be kept; the original discarded it, making this a no-op.
    vaex_df = vaex_df.dropna()
    vaex_df = vaex_df[vaex_df.NumMentions > 2000]
    vaex_df = vaex_df[vaex_df.SQLDATE > 20000000]
    vaex_df['EventType'] = vaex_df.EventCode.str.slice(start=0, stop=2)
    vaex_df['EventDetails'] = vaex_df.EventCode.str.slice(start=2)
    vaex_df['EventCountry'] = vaex_df.ActionGeo_ADM1Code.str.slice(start=0, stop=2)
    vaex_df['EventRegion'] = vaex_df.ActionGeo_ADM1Code.str.slice(start=2)
    vaex_df = vaex_df.drop(['EventCode', 'ActionGeo_ADM1Code'])
    # Second pass removes rows where the derived columns came out missing.
    vaex_df = vaex_df.dropna()
    print("Cleanup completed in", time.time() - clean_time, "seconds")
    test = vaex_df[vaex_df.SQLDATE > 20200000]
    train = vaex_df[vaex_df.SQLDATE < 20200000]
    return test, train, vaex_df


# vx_reader hands back the test split first, then the train split, then the
# full cleaned frame (bound to `raw` here).
(
    vx_test_df,
    vx_train_df,
    raw,
) = vx_reader("data")

Starting read
Read completed in 0.9666252136230469 seconds
Starting cleanup
Cleanup completed in 1.1245028972625732 seconds


In [4]:
def binary_encode(train_df, test_df, feature_list):
    """Multi-hot encode ``feature_list`` using an encoder fitted on ``train_df``.

    Args:
        train_df: Frame the ``vaex.ml.MultiHotEncoder`` is fitted on and
            transformed.
        test_df: Frame transformed with the already-fitted encoder.
        feature_list: Column names passed as the encoder's ``features``.

    Returns:
        ``(encoded_train, encoded_test)`` -- the transformed versions of
        ``train_df`` and ``test_df`` respectively.
    """
    encoder = vaex.ml.MultiHotEncoder(features=feature_list)
    encoded_train = encoder.fit_transform(train_df)
    return encoded_train, encoder.transform(test_df)


# Fit the encoder on the training split and reuse it for the held-out split.
# The first argument used to be vx_test_df as well, which made the "train" and
# "test" frames identical.
bin_df, bin_test = binary_encode(vx_train_df, vx_test_df, features)

In [6]:
# Convert the encoded train/test frames and the full cleaned frame to pandas,
# in that order.
bin_pd, test_pd, raw_pd = (
    frame.to_pandas_df() for frame in (bin_df, bin_test, raw)
)

In [7]:
def lin_reg(train_df, test_df):
    """Fit a linear regression on ``AvgTone`` and report train/test scores.

    The feature matrices are the input frames minus the identifier, date, raw
    string and target columns listed in ``excluded``. The train score is
    printed first, then the test score.

    Args:
        train_df: pandas DataFrame used for fitting.
        test_df: pandas DataFrame used for evaluation.

    Returns:
        ``(predictors, init_predictors, score, init_score, x_train, x_test,
        y_train, y_test)`` where ``predictors``/``score`` refer to the test
        frame, ``init_predictors``/``init_score`` refer to the training frame,
        and the two targets are single-column DataFrames.
    """
    excluded = ['GLOBALEVENTID', 'SQLDATE', 'Actor1Code', 'Actor1Name',
                'EventCountry', 'EventRegion', 'AvgTone']
    x_train = train_df.drop(columns=excluded)
    x_test = test_df.drop(columns=excluded)
    y_train = train_df['AvgTone'].to_frame()
    y_test = test_df['AvgTone'].to_frame()

    regressor = sklm.LinearRegression()
    regressor.fit(x_train, y_train)

    train_pred = regressor.predict(x_train)
    test_pred = regressor.predict(x_test)
    train_r2 = regressor.score(x_train, y_train)
    test_r2 = regressor.score(x_test, y_test)

    print(train_r2)
    print(test_r2)
    return test_pred, train_pred, test_r2, train_r2, x_train, x_test, y_train, y_test


# Run the linear-regression model on the encoded pandas frames.
(
    lm_predictors,
    lm_init_predictors,
    lm_score,
    lm_init_score,
    lm_X,
    lm_X_test,
    lm_y,
    lm_y_test,
) = lin_reg(bin_pd, test_pd)


0.25881717213078004
0.23761713626570946


In [None]:
def random_forest(train_df, test_df):
    """Fit a random-forest regressor on ``AvgTone`` and report train/test scores.

    The feature matrices are the input frames minus the columns listed in
    ``excluded``; the targets are flattened to 1-D arrays. The forest is built
    with ``n_jobs=-1`` and otherwise default settings. The train score is
    printed first, then the test score.

    Args:
        train_df: pandas DataFrame used for fitting.
        test_df: pandas DataFrame used for evaluation.

    Returns:
        An 11-tuple ``(predictors, init_predictors, score, init_score, path,
        init_path, x_train, x_test, y, y_test, model)``. ``predictors``,
        ``score`` and ``path`` refer to the test frame; the ``init_*`` values
        refer to the training frame; ``model`` is the fitted estimator.
    """
    excluded = ['GLOBALEVENTID', 'SQLDATE', 'Actor1Code', 'Actor1Name',
                'EventCountry', 'EventRegion', 'AvgTone']
    x_train = train_df.drop(columns=excluded)
    x_test = test_df.drop(columns=excluded)
    y = np.asarray(train_df['AvgTone']).ravel()
    y_test = np.asarray(test_df['AvgTone']).ravel()

    forest = ske.RandomForestRegressor(n_jobs=-1)
    forest.fit(x_train, y)

    train_path = forest.decision_path(x_train)
    test_path = forest.decision_path(x_test)
    train_pred = forest.predict(x_train)
    test_pred = forest.predict(x_test)
    train_r2 = forest.score(x_train, y)
    test_r2 = forest.score(x_test, y_test)

    print(train_r2)
    print(test_r2)
    return (test_pred, train_pred, test_r2, train_r2, test_path, train_path,
            x_train, x_test, y, y_test, forest)


# random_forest returns eleven values, the last one being the fitted model.
# Unpacking into only ten names raises "too many values to unpack".
(
    rf_predictors,
    rf_init_predictors,
    rf_score,
    rf_init_score,
    rf_path,
    rf_init_path,
    rf_X,
    rf_X_test,
    rf_y,
    rf_y_test,
    rf_model,
) = random_forest(bin_pd, test_pd)

In [9]:
def svm(df, test_df):
    """Fit a support-vector regressor on ``AvgTone`` and report train/test scores.

    The feature matrices are the input frames minus the columns listed in
    ``excluded``; the targets are flattened to 1-D arrays. ``SVR`` is used with
    its default hyper-parameters. The train score is printed first, then the
    test score.

    Args:
        df: pandas DataFrame used for fitting.
        test_df: pandas DataFrame used for evaluation.

    Returns:
        ``(predictors, init_predictors, score, init_score, x_train, x_test, y,
        y_test)`` where ``predictors``/``score`` refer to the test frame and
        ``init_predictors``/``init_score`` refer to the training frame.
    """
    # Import SVR explicitly: the notebook never imports `sklearn.svm`, so the
    # attribute lookup `sklearn.svm.SVR` only worked when some other sklearn
    # import happened to load that submodule as a side effect.
    from sklearn.svm import SVR

    excluded = ['GLOBALEVENTID', 'SQLDATE', 'Actor1Code', 'Actor1Name',
                'EventCountry', 'EventRegion', 'AvgTone']
    x_train = df.drop(excluded, axis=1)
    x_test = test_df.drop(excluded, axis=1)
    y = np.ravel(pd.DataFrame(df['AvgTone']))
    y_test = np.ravel(pd.DataFrame(test_df['AvgTone']))
    model = SVR()
    model.fit(x_train, y)
    init_predictors = model.predict(x_train)
    predictors = model.predict(x_test)
    init_score = model.score(x_train, y)
    score = model.score(x_test, y_test)
    print(init_score)
    print(score)
    return predictors, init_predictors, score, init_score, x_train, x_test, y, y_test


# Run the support-vector regressor on the encoded pandas frames.
(
    svm_predictors,
    svm_init_predictors,
    svm_score,
    svm_init_score,
    svm_X,
    svm_X_test,
    svm_y,
    svm_y_test,
) = svm(bin_pd, test_pd)

0.03180567915944965
0.026456959717162887


In [10]:
def nn(train_df, test_df):
    """Fit an MLP regressor on ``AvgTone`` and report train/test scores.

    The feature matrices are the input frames minus the columns listed in
    ``excluded``; the model is fed their underlying arrays (``.values``) and
    the targets are flattened to 1-D arrays. The train score is printed first,
    then the test score.

    Args:
        train_df: pandas DataFrame used for fitting.
        test_df: pandas DataFrame used for evaluation.

    Returns:
        ``(predictors, init_predictors, score, init_score, x_train, x_test, y,
        y_test)`` where ``predictors``/``score`` refer to the test frame,
        ``init_predictors``/``init_score`` refer to the training frame, and
        ``x_train``/``x_test`` are the feature DataFrames (not the arrays).
    """
    excluded = ['GLOBALEVENTID', 'SQLDATE', 'Actor1Code', 'Actor1Name',
                'EventCountry', 'EventRegion', 'AvgTone']
    x_train = train_df.drop(columns=excluded)
    x_test = test_df.drop(columns=excluded)
    y = np.asarray(train_df['AvgTone']).ravel()
    y_test = np.asarray(test_df['AvgTone']).ravel()

    train_matrix = x_train.values
    test_matrix = x_test.values

    # NOTE(review): scikit-learn documents `learning_rate` as applying only to
    # solver='sgd'; with the default solver it may have no effect -- confirm.
    mlp = sknn.MLPRegressor(early_stopping=True, max_iter=1000, learning_rate='adaptive')
    mlp.fit(train_matrix, y)

    train_pred = mlp.predict(train_matrix)
    test_pred = mlp.predict(test_matrix)
    train_r2 = mlp.score(train_matrix, y)
    test_r2 = mlp.score(test_matrix, y_test)

    print(train_r2)
    print(test_r2)
    return test_pred, train_pred, test_r2, train_r2, x_train, x_test, y, y_test


# Run the MLP regressor on the encoded pandas frames.
(
    nn_predictors,
    nn_init_predictors,
    nn_score,
    nn_init_score,
    nn_X,
    nn_X_test,
    nn_y,
    nn_y_test,
) = nn(bin_pd, test_pd)

0.24273410823070907
0.20849826552559836


In [11]:
# Frame shapes first, then one train/test score line per model.
print(bin_pd.shape)
print(test_pd.shape)
print("R2's (Train/Test):")
print(f"LinReg: {lm_init_score} / {lm_score}")
print(f"Random Forest: {rf_init_score} / {rf_score}")
print(f"SVR: {svm_init_score} / {svm_score}")
print(f"NN: {nn_init_score} / {nn_score}")

(68048, 52)
(8364, 52)
R2's (Train/Test):
LinReg: 0.25881717213078004 / 0.23761713626570946
Random Forest: 0.9098906539922159 / 0.21151952228782245
SVR: 0.03180567915944965 / 0.026456959717162887
NN: 0.24273410823070907 / 0.20849826552559836
