<a href="https://colab.research.google.com/github/svlataki/DataChallenge/blob/main/Challenge_TfIdf_XGBoost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [34]:
import csv
import re
import zipfile
from io import BytesIO
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
!pip install greek-stemmer-pos
!pip install nltk
import nltk
import pickle
from nltk.stem import WordNetLemmatizer
from greek_stemmer import stemmer as greek_stemmer
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw')


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw to /root/nltk_data...
[nltk_data]   Package omw is already up-to-date!


True

In [5]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/My Drive/Data_Challenge

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/Data_Challenge


In [6]:
# Read training data
train_domains = list()
y_train = list()
with open("train.txt", 'r') as f:
    for line in f:
        l = line.split(',')
        train_domains.append(l[0])
        y_train.append(l[1][:-1])

# Read test data
test_domains = list()
with open("test.txt", 'r') as f:
    for line in f:
        l = line.split(',')
        test_domains.append(l[0])

In [7]:
# Read textual content of webpages of domain names
text = dict()
with zipfile.ZipFile('domains.zip', "r") as zfile:
    for filename in zfile.namelist():
        if re.search(r'\.zip$', filename) is not None:
            zfiledata = BytesIO(zfile.read(filename))
            with zipfile.ZipFile(zfiledata) as zfile2:
                text[filename[:-4]] = ''
                for name2 in zfile2.namelist():
                    file = zfile2.read(name2)
                    text[filename[:-4]] += file.decode('utf16') + ' '

In [8]:
# Retrieve textual content of domain names of the training set
train_data = list()
for domain in train_domains:
    if domain in text:
        train_data.append(text[domain])
    else:
        train_data.append('')

In [9]:
# Retrieve textual content of domain names of the test set
test_data = list()
for domain in test_domains:
    if domain in text:
        test_data.append(text[domain])
    else:
        test_data.append('')

# To reduce memory 
text = None

In [10]:
# preprocess data
def data_preprocessing(data):
    stemmer = WordNetLemmatizer()
    docs = []

    for doc in data:
        #remove special characthers

        document = re.sub("\\<.*?\\>", "",doc);        

        # Remove non-word characters such as numbers etc

        document = re.sub(r'\W', ' ', document)

        # Remove all single characters

        document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)    

        # Remove single characters from the start

        document = re.sub(r'\^[a-zA-Z]\s+', ' ', document)        

        # Substituting multiple spaces with single space

        document = re.sub(r'\s+', ' ', document, flags=re.I)    

        # remove digit characters

        document = re.sub(r'[0-9]', '', document)

        # Split the document in whitespaces (--> List of words)

        document = document.split()  

        # Lemmatize each word in the list

        document = [stemmer.lemmatize(word) for word in document]

        # stem each greek word in the list
        document = [greek_stemmer.stem_word(word,'NNM') for word in document]

        document = [w for w in document if len(w) > 3]

        # Reconstruct the document by joining the words using whitespace

        document = ' '.join(document)

        # Gather all the documents

        docs.append(document)



    return docs

In [None]:
train_data_preprocessed = data_preprocessing(train_data)
pickle.dump(train_data_preprocessed,open("train_data_preprocessed.pkl" ,'wb'))
test_data_preprocessed = data_preprocessing(test_data)
pickle.dump(test_data_preprocessed,open("test_data_preprocessed.pkl" ,'wb'))

In [13]:
with open('train_data_preprocessed.pkl', 'rb') as pickle_file:
    train_data_preprocessed = pickle.load(pickle_file)
with open('test_data_preprocessed.pkl', 'rb') as pickle_file:
    test_data_preprocessed = pickle.load(pickle_file)


In [24]:
from sklearn.model_selection import train_test_split

X_train, X_dev, Y_train, Y_dev = train_test_split(train_data_preprocessed, y_train, test_size=0.2,stratify=y_train, random_state=42)

In [25]:
# Create the training matrix. Each row corresponds to a domain name and each column to a word present in at least 10 webpages 
# and at most 50 webpages of domain names. The value of each entry in a row is equal to the tf-idf weight of that word in the 
# corresponding domain       
vec = TfidfVectorizer(decode_error='ignore', strip_accents='unicode', min_df=10, max_df=50, ngram_range=(1,3), stop_words=stopwords.words('english'))
X_train_transformed = vec.fit_transform(X_train)

# Create the test matrix following the same approach as in the case of the training matrix
X_test_transformed = vec.transform(test_data_preprocessed)

#print("Train matrix dimensionality: ", train_data_preprocessed.shape)
#print("Test matrix dimensionality: ", test_data_preprocessed.shape)

In [26]:
X_train_transformed.shape

(1006, 35475)

In [27]:
from sklearn.ensemble import BaggingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

best_clf = XGBClassifier(max_depth=5,n_estimators=250, learning_rate=0.1,
                         min_child_weight=5,gamma=0.1, subsample = 0.6, colsample_bytree=0.6,
                         objective='multi:softprob', seed=1997, reg_lambda=0.01)
best_clf = BaggingClassifier(base_estimator=best_clf, n_estimators=3,random_state=1997,bootstrap=False).fit(X_train_transformed,Y_train)


In [28]:
from sklearn.metrics import classification_report
from sklearn.metrics import log_loss

X_dev_transformed =  vec.transform(X_dev)

y_pred_val = best_clf.predict(X_dev_transformed)
y_pred_val_prob = best_clf.predict_proba(X_dev_transformed)

print(classification_report(Y_dev, y_pred_val))
print("Log Loss is ",log_loss(Y_dev,y_pred_val_prob)) 

              precision    recall  f1-score   support

           0       0.54      0.96      0.69        67
           1       0.94      0.57      0.71        28
           2       0.61      0.49      0.54        35
           3       0.63      0.77      0.69        69
           4       0.75      0.27      0.40        11
           5       0.00      0.00      0.00        10
           6       0.00      0.00      0.00        14
           7       0.00      0.00      0.00         6
           8       0.00      0.00      0.00         4
           9       0.00      0.00      0.00         8

    accuracy                           0.61       252
   macro avg       0.35      0.31      0.30       252
weighted avg       0.54      0.61      0.54       252

Log Loss is  1.3335695532969987


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Train in whole dataset ###


In [29]:

# Create the training matrix. Each row corresponds to a domain name and each column to a word present in at least 10 webpages 
# and at most 50 webpages of domain names. The value of each entry in a row is equal to the tf-idf weight of that word in the 
# corresponding domain       
vec = TfidfVectorizer(decode_error='ignore', strip_accents='unicode', min_df=10, max_df=50, ngram_range=(1,3), stop_words=stopwords.words('english'))
X_train_transformed = vec.fit_transform(train_data_preprocessed)
X_test_transformed = vec.fit_transform(test_data_preprocessed)


In [32]:
X_test_transformed = vec.fit_transform(test_data_preprocessed)


In [30]:
from sklearn.ensemble import BaggingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

best_clf = XGBClassifier(max_depth=5,n_estimators=250, learning_rate=0.1,
                         min_child_weight=5,gamma=0.1, subsample = 0.6, colsample_bytree=0.6,
                         objective='multi:softprob', seed=1997, reg_lambda=0.01)
best_clf = BaggingClassifier(base_estimator=best_clf, n_estimators=3,random_state=1997,bootstrap=False).fit(X_train_transformed,y_train)


In [None]:
pred = best_clf.predict(X_test_transformed)
y_pred = best_clf.predict_proba(X_test_transformed)

In [20]:
# Write predictions to a file
with open('sample_submission_xgboost_tfidf.csv', 'w') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    lst = list()
    for i in range(10):
        lst.append('class_'+str(i))
    lst.insert(0, "domain_name")
    writer.writerow(lst)
    for i,test_host in enumerate(test_domains):
        lst = y_pred[i,:].tolist()
        lst.insert(0, test_host)
        writer.writerow(lst)

In [21]:
y_pred

array([[4.78210673e-02, 1.11523941e-01, 1.22909933e-01, ...,
        3.96031477e-02, 4.23424095e-02, 7.11465478e-02],
       [4.06271011e-01, 4.83447127e-02, 3.05153161e-01, ...,
        1.77581888e-02, 1.81083474e-02, 3.10245883e-02],
       [6.91385940e-04, 9.91723359e-01, 5.10249811e-04, ...,
        3.03476723e-03, 1.20857701e-04, 2.06061683e-04],
       ...,
       [3.31729203e-01, 8.61254036e-02, 1.04041293e-01, ...,
        2.70989370e-02, 3.24273296e-02, 5.55569455e-02],
       [9.02644455e-01, 1.19204642e-02, 2.08953638e-02, ...,
        3.62119661e-03, 4.98448871e-03, 7.42400484e-03],
       [2.11842880e-01, 2.20168546e-01, 1.00283362e-01, ...,
        1.83545705e-02, 2.19635740e-02, 3.76296602e-02]])