In [1]:
import functions as func
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Preprocessing

In [2]:
# Directory that holds the dataset CSV files.
path = 'data'

# Load the datasets; later cells look frames up by file name (e.g. 'train.csv').
df_dict = func.readCSV_DATA(path)

In [3]:
# Run the project's preprocessing on the training frame, then preview the text
# column that is vectorized further down.
df_train = func.prepareData(df_dict['train.csv'])
df_train['text_concat_filter']

0       What am I losing when using extension tubes in...
1       What is the distinction between a city and a s...
2       Maximum protusion length for through-hole comp...
3       Can an affidavit be used in Beit Din?\n\nAn af...
4       How do you make a binary image in Photoshop?\n...
                              ...                        
6074    Using a ski helmet for winter biking\n\nI am c...
6075    Adjustment to road bike brakes for high grade ...
6076    Suppress 'file truncated' messages when using ...
6077    When should a supervisor be a co-author?\n\nWh...
6078    Why are there so many different types of screw...
Name: text_concat_filter, Length: 6079, dtype: object

In [4]:
# Extracting simple text features
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [None]:

# TF-IDF features over NLTK word tokens. A term is kept only if it occurs in at
# least 3 documents and in at most (number of training rows // 30) documents;
# an integer `max_df` is an absolute document count, not a fraction.
vectorizer = TfidfVectorizer(
    tokenizer=word_tokenize,
    stop_words='english',
    strip_accents='ascii',
    min_df=3,
    max_df=len(df_train) // 30,
)
X = vectorizer.fit_transform(df_train['text_concat_filter'])
df_train.head()

In [None]:
# Targets: the 30 columns at positions 11-40 of the prepared frame. This is a
# positional slice, so it silently depends on the column order — TODO confirm
# these are exactly the label columns.
y = df_train.iloc[:,11:41]

In [None]:
# Hold out 15% of the rows for evaluation. The seed is fixed so the same rows
# land in each split on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42
)

# Train a single-layer regression model (dense layer with sigmoid outputs) in NN format

In [None]:
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Dense, Activation

In [None]:
# Keras is fed dense NumPy arrays: densify the TF-IDF matrices and convert the
# target frames. `DataFrame.as_matrix()` was removed in pandas 1.0, so
# `to_numpy()` is used instead. All four conversions happen in one assignment
# so a failure leaves none of the names half-converted.
X_train, X_test, y_train, y_test = (
    X_train.toarray(),
    X_test.toarray(),
    y_train.to_numpy(),
    y_test.to_numpy(),
)

## Create model

In [None]:
# Training hyper-parameters used by the fit call.
batch_size = 128
nb_epoch = 20

# One input per vocabulary term, one output per target column.
input_dim = len(vectorizer.vocabulary_)
output_dim = y.shape[1]

# A single dense layer with a sigmoid on every output.
model = Sequential()
model.add(Dense(units=output_dim, input_dim=input_dim, activation='sigmoid'))

In [None]:
# MSE is both the training loss and the reported metric.
model.compile(loss='mse', optimizer='adam', metrics=['mse'])

# The held-out split is passed as validation data so its loss is logged per epoch.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=nb_epoch,
    batch_size=batch_size,
    verbose=1,
)

## Evaluate

In [None]:
# `evaluate` returns [loss, mse] for this model; index 1 is the 'mse' metric.
print("MSE")
train_mse = model.evaluate(X_train, y_train, verbose=0)[1]
print("Training: %.2f" % train_mse)
test_mse = model.evaluate(X_test, y_test, verbose=0)[1]
print("Testing : %.2f" % test_mse)

In [None]:
# Model outputs for the training split and the held-out split.
y_hat_train, y_hat_test = model.predict(X_train), model.predict(X_test)

In [None]:
from scipy.stats import spearmanr

# One Spearman result per target column, comparing predictions with the truth
# on each split.
spears_test = [
    spearmanr(y_hat_test[:, col], y_test[:, col]) for col in range(y.shape[1])
]
spears_train = [
    spearmanr(y_hat_train[:, col], y_train[:, col]) for col in range(y.shape[1])
]

In [None]:
# Average the per-column correlations, leaving out columns whose correlation
# is NaN.
print('Spearman Correlation')
train_rhos = [res.correlation for res in spears_train]
print("Training: %.2f" % np.mean([rho for rho in train_rhos if not np.isnan(rho)]))
test_rhos = [res.correlation for res in spears_test]
print("Testing : %.2f" % np.mean([rho for rho in test_rhos if not np.isnan(rho)]))

# Submission

In [None]:
# Submission template and raw test set from the loaded CSVs. No copy is taken
# here, so if `df_dict` hands back its stored frame, the column assignments
# made to `sub` further down change that frame too.
sub = df_dict['sample_submission.csv']
df_test = df_dict['test.csv']

In [None]:
# Build the submission predictions in one step.
#
# The competition test set gets its own names (`X_submission`,
# `y_hat_submission`) so that the held-out split (`X_test`, `y_hat_test`) used
# by the evaluation cells is not overwritten with submission data.
df_test = func.prepareData(df_test)

# Reuse the vocabulary fitted on the training text; do not refit here.
X_submission = vectorizer.transform(df_test['text_concat_filter'])
y_hat_submission = model.predict(X_submission.toarray())

# Prediction column k corresponds to the k-th target column of `y`.
for col_index, col in enumerate(y.columns):
    sub[col] = y_hat_submission[:, col_index]

In [None]:
# Quick look at the first rows of the submission frame before writing it out.
sub.head()

In [None]:
# Write the submission file to the working directory, without the index column.
sub.to_csv("submission.csv", index = False)