In [75]:
import numpy as np
import os
import csv
import zipfile
from sklearn.feature_extraction.text import TfidfVectorizer
import networkx as nx
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from ge import DeepWalk, SDNE, LINE
from sklearn.naive_bayes import MultinomialNB
import re
from sklearn.ensemble import RandomForestClassifier
from gensim.models.wrappers import FastText
import nltk
from nltk.corpus import stopwords
import warnings
import sklearn.exceptions
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint
from keras_tqdm import TQDMNotebookCallback
from keras.layers.core import Dense, Dropout
from keras.optimizers import Adam
from keras import backend as K
from sklearn.preprocessing import LabelBinarizer

In [76]:
# Locations of the challenge data, relative to the notebook's working directory.
# Every file is resolved from the same root so the notebook is not tied to one
# machine's home directory.
DATA_DIR = '../data'
DOMAINS_DIR = os.path.join(DATA_DIR, 'domains')

# Read the labelled training hosts: one "host,label" record per line.
train_hosts = list()
y_train = list()
with open(os.path.join(DATA_DIR, 'train.csv'), 'r') as f:
    for line in f:
        # Strip the line terminator explicitly instead of chopping the last
        # character of the label: that would corrupt the label of a final line
        # without a trailing newline and leave a '\r' on CRLF files.
        fields = line.rstrip('\r\n').split(',')
        if fields == ['']:
            continue  # skip blank lines
        train_hosts.append(fields[0])
        y_train.append(fields[1])

# Read the unlabelled test hosts (first column only).
test_hosts = list()
with open(os.path.join(DATA_DIR, 'test.csv'), 'r') as f:
    for line in f:
        fields = line.rstrip('\r\n').split(',')
        if fields == ['']:
            continue  # skip blank lines
        test_hosts.append(fields[0])

# Map each host name (archive name without ".zip") to the concatenated text of
# every member of its archive. Members are decoded as UTF-16.
text = dict()
for filename in os.listdir(DOMAINS_DIR):
    if not filename.endswith('.zip'):
        continue
    # The context manager closes the archive even if a member fails to decode.
    with zipfile.ZipFile(os.path.join(DOMAINS_DIR, filename)) as z:
        text[filename[:-4]] = ''.join(
            z.read(member).decode('utf16') for member in z.namelist()
        )

# Split the training hosts into those with crawled text and those without.
# pos_list_train holds the positions (in train_hosts) of hosts without text.
# enumerate() records the actual position of every entry, so a host that
# appears more than once is no longer mapped to its first occurrence only.
train_data = list()
pos_list_train = list()
for i, host in enumerate(train_hosts):
    if host in text:
        train_data.append(text[host])
    else:
        pos_list_train.append(i)

# Set membership keeps the filters below linear instead of quadratic.
missing_train = set(pos_list_train)
y_train_without_text = [y_train[i] for i in pos_list_train]
y_train_with_text = [y for i, y in enumerate(y_train) if i not in missing_train]
train_hosts_without_text = [train_hosts[i] for i in pos_list_train]
train_hosts_with_text = [h for i, h in enumerate(train_hosts) if i not in missing_train]


# Read webpages of the test set, with the same with/without-text split.
test_data = list()
pos_list_test = list()
for i, host in enumerate(test_hosts):
    if host in text:
        test_data.append(text[host])
    else:
        pos_list_test.append(i)

missing_test = set(pos_list_test)
test_hosts_without_text = [test_hosts[i] for i in pos_list_test]
test_hosts_with_text = [h for i, h in enumerate(test_hosts) if i not in missing_test]

## 1.) Nodes with text and graph

### a.) Using only text information

In [77]:
# Load the pre-trained Greek FastText embeddings (AUEB) from their binary file.
fasttext_model_path = '../data/grcorpus_def.bin'
fasttext = FastText.load_fasttext_format(fasttext_model_path)

In [78]:
# Build the Greek stop-word set from two sources: NLTK's built-in list and an
# extra word list shipped with the challenge data.
stop_words_1 = set(stopwords.words('greek'))

# Read the extra list directly from the relative data directory used by the
# other cells, instead of passing a machine-specific absolute path through the
# NLTK corpus reader. One word per line; blank lines are ignored.
# NOTE(review): assumes the file is UTF-8 encoded - confirm.
with open('../data/stopwords_greek.txt', 'r', encoding='utf-8') as stopword_file:
    stop_words_2 = set(line.strip() for line in stopword_file if line.strip())

# Lower-case the extra list so it can match the lower-cased tokens it is
# compared against in text_centroid.
stop_words_greek = set(word.lower() for word in stop_words_2)

stop_words = stop_words_1.union(stop_words_greek)

In [79]:
# Represent a document by the mean of the embeddings of its non-stop-word tokens.
def text_centroid(text, model):
    """Return the centroid (mean word vector) of ``text``.

    The text is split into sentences; URLs and the characters ``*`` and ``#``
    are removed from each sentence, which is then lower-cased and tokenized.
    Tokens found in the module-level ``stop_words`` set and tokens the model
    cannot embed are skipped.

    Parameters
    ----------
    text : str
        Raw document text.
    model : mapping-like
        Object supporting ``model[word]`` that returns a numeric vector and
        raises ``KeyError`` for words it cannot embed.

    Returns
    -------
    numpy.ndarray
        Mean of the vectors of all embedded tokens. When no token can be
        embedded, a zero vector of length ``model.vector_size`` is returned
        (an empty array if the model has no ``vector_size`` attribute) instead
        of dividing by zero.
    """
    # Raw strings keep the regex escapes from being read as (invalid) string escapes.
    url_pattern = re.compile(r"(\w+:\/\/\S+)")
    marker_pattern = re.compile(r"[*#*]")

    vector_sum = None
    embedded_count = 0
    for sentence in nltk.sent_tokenize(text):
        # Blank out URLs and markers, collapsing whitespace after each pass.
        sentence = ' '.join(url_pattern.sub(" ", sentence).split())
        sentence = ' '.join(marker_pattern.sub(" ", sentence).split())
        sentence = sentence.lower()
        for word in nltk.word_tokenize(sentence):
            if word in stop_words:
                continue
            try:
                word_vector = model[word]
            except KeyError:
                # Word cannot be embedded by the model: skip it.
                continue
            if vector_sum is None:
                vector_sum = word_vector
            else:
                # np.add allocates a new array, so the model's own vectors
                # are never modified in place.
                vector_sum = np.add(vector_sum, word_vector)
            embedded_count += 1

    if embedded_count == 0:
        # Nothing to average: return a well-formed vector so that callers can
        # still stack the per-document centroids.
        return np.zeros(getattr(model, 'vector_size', 0))

    return np.asarray(vector_sum) / embedded_count

In [80]:
# Text features for the hosts that have crawled text: every document becomes
# one row holding the centroid of its FastText word vectors.
train_centroids = [text_centroid(document, fasttext) for document in train_data]
X_train_with_text = np.stack(train_centroids, axis=0)

test_centroids = [text_centroid(document, fasttext) for document in test_data]
X_test_with_text = np.stack(test_centroids, axis=0)

In [81]:
# Sanity check: rows = hosts with text, columns = embedding dimensions.
train_text_shape = X_train_with_text.shape
test_text_shape = X_test_with_text.shape
print("Train matrix with text dimensionality: ", train_text_shape)
print("Test matrix with text dimensionality: ", test_text_shape)

Train matrix with text dimensionality:  (677, 300)
Test matrix with text dimensionality:  (171, 300)


In [82]:
# Train an MLP on the text centroids of the training hosts that have text.

# Encode the string labels as indicator columns (one column per class).
mlb = LabelBinarizer()
ys = mlb.fit_transform(y_train_with_text)

# Two hidden ReLU layers with dropout, followed by a softmax over the classes.
model2 = Sequential()
model2.add(Dense(1024, input_dim=X_train_with_text.shape[1], activation='relu'))
model2.add(Dropout(0.1))
model2.add(Dense(256, activation='relu'))
model2.add(Dropout(0.2))
# Size the output layer from the encoded labels instead of hard-coding 5,
# so the network always matches the label matrix it is trained on.
model2.add(Dense(ys.shape[1], activation='softmax'))

# summary() prints the table itself and returns None; wrapping it in print()
# only adds a stray "None" line to the output.
model2.summary()
model2.compile(loss='categorical_crossentropy',
               optimizer=Adam(lr=0.001))

history2 = model2.fit(X_train_with_text, ys,
                      batch_size=64,
                      epochs=25,
                      verbose=1)

# Class probabilities for the test hosts that have text (one row per host).
y_pred_with_text = model2.predict(X_test_with_text)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_35 (Dense)             (None, 1024)              308224    
_________________________________________________________________
dropout_22 (Dropout)         (None, 1024)              0         
_________________________________________________________________
dense_36 (Dense)             (None, 256)               262400    
_________________________________________________________________
dropout_23 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_37 (Dense)             (None, 5)                 1285      
Total params: 571,909
Trainable params: 571,909
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12

### b.) Using only graph information

In [83]:
# Load the hyperlink graph as a directed graph from the space-separated edge list.
G = nx.read_edgelist(
    '../data/edgelist.txt',
    delimiter=' ',
    create_using=nx.DiGraph(),
)

# Report the graph size: node count first, then edge count.
for graph_size in (G.number_of_nodes(), G.number_of_edges()):
    print(graph_size)

65208
1642073


In [84]:
# Learn DeepWalk node embeddings on the host graph.
walk_params = dict(walk_length=20, num_walks=50, workers=4)
train_params = dict(embed_size=1000, window_size=5, iter=2, workers=4)

model = DeepWalk(G, **walk_params)  # init model
model.train(**train_params)  # train model
# Embedding vectors, looked up by host name in the cells below.
embeddings = model.get_embeddings()

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:  2.1min finished


Learning embedding vectors...
Learning embedding vectors done!


In [85]:
# Graph features: each row is the DeepWalk embedding of one web host.
# The training matrix covers every training host (with or without text);
# the 1000 columns correspond to the embed_size used when training DeepWalk.
X_train_with_text2 = np.zeros((len(train_hosts), 1000))
for row, host in enumerate(train_hosts):
    X_train_with_text2[row, :] = embeddings[host]

# The test matrix covers only the test hosts that also have text.
X_test_with_text2 = np.zeros((len(test_hosts_with_text), 1000))
for row, host in enumerate(test_hosts_with_text):
    X_test_with_text2[row, :] = embeddings[host]

In [86]:
# Sanity check: rows = hosts, columns = graph-embedding dimensions.
train_graph_shape = X_train_with_text2.shape
test_graph_shape = X_test_with_text2.shape
print("Train matrix with text2 dimensionality: ", train_graph_shape)
print("Test matrix with text2 dimensionality: ", test_graph_shape)

Train matrix with text2 dimensionality:  (801, 1000)
Test matrix with text2 dimensionality:  (171, 1000)


In [87]:
# Train an MLP on the graph embeddings of all training hosts.

# Encode the string labels as indicator columns (one column per class).
mlb = LabelBinarizer()
ys = mlb.fit_transform(y_train)

# One hidden ReLU layer with dropout, followed by a softmax over the classes.
model4 = Sequential()
model4.add(Dense(1024, input_dim=X_train_with_text2.shape[1], activation='relu'))
model4.add(Dropout(0.3))
# Size the output layer from the encoded labels instead of hard-coding 5,
# so the network always matches the label matrix it is trained on.
model4.add(Dense(ys.shape[1], activation='softmax'))

# summary() prints the table itself and returns None; wrapping it in print()
# only adds a stray "None" line to the output.
model4.summary()
model4.compile(loss='categorical_crossentropy',
               optimizer=Adam(lr=0.001))

history4 = model4.fit(X_train_with_text2, ys,
                      batch_size=64,
                      epochs=7,
                      verbose=1)

# Class probabilities for the test hosts that have text (one row per host).
y_pred_with_text2 = model4.predict(X_test_with_text2)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_38 (Dense)             (None, 1024)              1025024   
_________________________________________________________________
dropout_24 (Dropout)         (None, 1024)              0         
_________________________________________________________________
dense_39 (Dense)             (None, 5)                 5125      
Total params: 1,030,149
Trainable params: 1,030,149
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


#### We average the predictions for nodes with graph and text

In [88]:
# Ensemble for hosts with text: element-wise mean of the text-based and the
# graph-based class probabilities.
y_pred_with = np.divide(np.add(y_pred_with_text, y_pred_with_text2), 2)

## 2.) Nodes with graph only

### a.) Using graph information

In [89]:
# The graph model (model4) is already trained, so only the feature matrices
# are built here. Each row is the DeepWalk embedding of one web host.
X_train_without_text = np.zeros((len(train_hosts), 1000))
for row_idx, host_name in enumerate(train_hosts):
    X_train_without_text[row_idx, :] = embeddings[host_name]

# Test matrix for the hosts that have no crawled text (graph information only).
X_test_without_text = np.zeros((len(test_hosts_without_text), 1000))
for row_idx, host_name in enumerate(test_hosts_without_text):
    X_test_without_text[row_idx, :] = embeddings[host_name]

train_without_shape = X_train_without_text.shape
test_without_shape = X_test_without_text.shape
print("Train matrix without text dimensionality: ", train_without_shape)
print("Test matrix without text dimensionality: ", test_without_shape)

Train matrix without text dimensionality:  (801, 1000)
Test matrix without text dimensionality:  (29, 1000)


In [94]:
# Predict the text-less test hosts with the graph-only model, then display the
# total number of predicted values (rows x classes).
y_pred_without_text = model4.predict(X_test_without_text)
np.size(y_pred_without_text)

145

## 3.) Recompose the predictions for the test set in the original host order

In [95]:
# Merge the two prediction sets back into one matrix whose rows follow the
# order of test_hosts.
n_test_hosts = len(test_hosts)
# Zero-initialised, so a row can never hold uninitialised memory.
y_pred = np.zeros((n_test_hosts, y_pred_with.shape[1]))

# Positions of the hosts that do have text: every test position that is not
# listed in pos_list_test. The range is derived from the actual test-set size
# instead of a hard-coded 200.
positions_without_text = set(pos_list_test)
pos_a = [i for i in range(n_test_hosts) if i not in positions_without_text]

# Each prediction set must line up one-to-one with its list of positions.
assert len(pos_list_test) == len(y_pred_without_text), "graph-only predictions do not match hosts without text"
assert len(pos_a) == len(y_pred_with), "averaged predictions do not match hosts with text"

# Hosts without text: predictions from the graph-only model.
for position, probabilities in zip(pos_list_test, y_pred_without_text):
    y_pred[position] = probabilities

# Hosts with text: averaged text + graph predictions.
for position, probabilities in zip(pos_a, y_pred_with):
    y_pred[position] = probabilities

In [96]:
# Write predictions to a file: a header row, then one row per test host with
# its class probabilities.
# newline='' is required when handing a file to csv.writer; without it,
# platforms that translate line endings end up with an extra '\r' per row.
with open('sample_submission.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    # NOTE(review): the class columns are assumed to be in the same order as
    # the label encoder's columns used for training - confirm against mlb.classes_.
    header = ["Host", 'athlitismos', 'diaskedasi-psyxagogia', 'eidiseis-mme',
              'katastimata-agores', 'pliroforiki-diadiktyo']
    writer.writerow(header)
    for i, test_host in enumerate(test_hosts):
        writer.writerow([test_host] + y_pred[i, :].tolist())