In [1]:
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import sklearn
import os, sys
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
def get_rel_path(file_name):
    absolutepath = os.path.abspath('')
    #print(absolutepath)
    fileDirectory = os.path.dirname(absolutepath)
    file_path = os.path.join(fileDirectory, 'Data/' + str(file_name))   
    #print(file_path)
    return file_path

In [None]:
# DO NOT RUN THIS --> code I used to make preprocess input and generate dataframe 
# takes in a raw xml file, extracts the orthographic form & POS tag, and returns it as a bag of words df 
def make_dataframe(soup):
    orth_form = soup.find_all("orth")
    ctag = soup.find_all("ctag")
    
    # get pos tags 
    classes = []
    for i in range(0, len(orth_form)):
        ct = ctag[i].get_text().split(":")[0] #get the first tag  
        classes.append(ct)
    
    # make bow  
    context_window_list = []
    for i in range(0, len(orth_form)):
        row = []
        target_word = orth_form[i].get_text().lower()
        # get three words before:
        for j in reversed(range(1,4)):
            if (i-j) < 0: row.append("START")
            else: row.append(orth_form[i-j].get_text().lower())
        # add target word
        row.append(target_word)
        # get three words after:
        for j in range(1,4):
            if (i+j) > len(orth_form) - 1: row.append("END")
            else: row.append(orth_form[i+j].get_text().lower())
        context_window_list.append(row)
    
    #make bow df 
    context_window = np.array(context_window_list)
    context_window = np.transpose(context_window)
    data = {"a1": context_window[0], "a2": context_window[1], "a3": context_window[2], 
       "a4": context_window[3], "a5": context_window[4], "a6": context_window[5], 
       "a7": context_window[6], "class": classes}
    df = pd.DataFrame(data)
    return df 

#load train and validate datasets in dataframes
train = get_rel_path("train.xml")
train = open(train, 'r')
train_contents = train.read()
train_soup = BeautifulSoup(train_contents, 'xml')
train_df = make_dataframe(train_soup)

validate = get_rel_path("validate.xml")
validate = open(validate, 'r')
validate_contents = validate.read()
validate_soup = BeautifulSoup(validate_contents, 'xml')
validate_df = make_dataframe(validate_soup)

test = get_rel_path("Data/test-full-1.xml")
test = open(test, 'r')
test_contents = test.read()
test_soup = BeautifulSoup(test_contents, 'xml')
test_df = make_dataframe(test_soup)

#export to csv for easier loading
t_csv = get_rel_path("train_baseline.csv")
train_df.to_csv(t_csv, index = False)
v_csv = get_rel_path("validate_baseline.csv")
validate_df.to_csv(v_csv, index = False)
test_csv = get_rel_path("test_baseline.csv")
test_df.to_csv(test_csv, index = False)

In [5]:
# RUN THIS --> LOAD train & validate datasets from csv files
t_csv = get_rel_path("train_baseline.csv")
train_df = pd.read_csv(t_csv)

v_csv = get_rel_path("validate_baseline.csv")
validate_df = pd.read_csv(v_csv)

test_csv = get_rel_path("test_baseline.csv")
test_df = pd.read_csv(test_csv)

In [3]:
# combine validate_df + train_df to form the train test set:
frames = [train_df, validate_df]
train_df = pd.concat(frames)
train_df.head()

Unnamed: 0,a1,a2,a3,a4,a5,a6,a7,class
0,START,START,START,zabiję,cię,",",jeśli,fin
1,START,START,zabiję,cię,",",jeśli,umrzesz,ppron12
2,START,zabiję,cię,",",jeśli,umrzesz,!,interp
3,zabiję,cię,",",jeśli,umrzesz,!,"""",comp
4,cię,",",jeśli,umrzesz,!,"""",cieszy,fin


In [4]:
train_df.describe()

Unnamed: 0,a1,a2,a3,a4,a5,a6,a7,class
count,972600,972600,972600,972600,972600,972600,972600,972600
unique,111957,111957,111957,111956,111957,111957,111957,35
top,.,.,.,.,.,.,.,subst
freq,72727,72727,72727,72728,72728,72728,72728,265749


In [5]:
test_df.head()

Unnamed: 0,a1,a2,a3,a4,a5,a6,a7,class
0,START,START,START,rozumiem,",",że,olechowskiego,fin
1,START,START,rozumiem,",",że,olechowskiego,",",interp
2,START,rozumiem,",",że,olechowskiego,",",który,comp
3,rozumiem,",",że,olechowskiego,",",który,był,subst
4,",",że,olechowskiego,",",który,był,wtedy,interp


In [6]:
test_df.describe()

Unnamed: 0,a1,a2,a3,a4,a5,a6,a7,class
count,242913,242913,242913,242913,242913,242913,242913,242913
unique,48053,48053,48053,48052,48053,48053,48053,35
top,.,.,.,.,.,.,.,subst
freq,18216,18216,18216,18217,18217,18217,18217,65871


In [7]:
from sklearn import preprocessing
# Transform the training & validation data from symbols to numbers 
# Takes in a df of the correct format & returns a numerical representation of the data 
def make_vectors(df):
    labels = np.asarray(df['class'].astype("category").cat.codes.tolist())
    print('There are {} classes (POS labels) in this data.'.format(len(list(set(labels)))))
    labels[:5]
    
    X_vals = df.drop(columns=['class']).values
    #print("X Values:")
    #print(X_vals)
    
    le = preprocessing.LabelEncoder()
    le.fit(X_vals.ravel())
    X = le.transform(X_vals.ravel())
    X = X.reshape(len(X_vals), -1)
    len(list(set(X_vals.ravel())))
    #print("\nNumerical representation:")
    #print(X)
    return X 

In [8]:
# Transform the data from symbols to numbers 
X_train = make_vectors(train_df)
print(X_train)
print()
X_test = make_vectors(test_df)
print(X_test)

There are 35 classes (POS labels) in this data.
[[  1917   1917   1917 ...  10968     20  27402]
 [  1917   1917 101367 ...     20  27402  89525]
 [  1917 101367  10968 ...  27402  89525      0]
 ...
 [    21  64430  47300 ... 103451     44   1916]
 [ 64430  47300  33561 ...     44   1916   1916]
 [ 47300  33561 103451 ...   1916   1916   1916]]

There are 35 classes (POS labels) in this data.
[[  765   765   765 ...    16 47828 21666]
 [  765   765 31419 ... 47828 21666    16]
 [  765 31419    16 ... 21666    16 14265]
 ...
 [16108 40385  5722 ...     5    21   764]
 [40385  5722 45937 ...    21   764   764]
 [ 5722 45937     5 ...   764   764   764]]


# TRAIN and EVALUATE THE MODELS

In [9]:
# Define a function to take as input training and testing vectors and labels
# Allow this to be extensible to let multiple classifiers be used here
def buildClassifiers(clf, X_train, X_test, y_train, y_test):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    f1 = f1_score(y_test, y_pred, average="micro")
    precision = precision_score(y_test, y_pred, average="micro")
    recall = recall_score(y_test, y_pred, average="micro")

    return f1, precision, recall

In [16]:
# Construct the classifiers 
def train_and_evaluate(X_train, X_test, train_df, test_df):
    names = ['Naive_Bayes', 'Decision_Tree']
    classifiers = [GaussianNB(), 
                   DecisionTreeClassifier(random_state=42),
                  LogisticRegressionCV(cv=5, random_state=0)]
    aList, bList, cList = list(), list(), list()
    for name, clf in zip(names, classifiers):
        print('Now classifying', name)
        y_train = train_df["class"]
        y_test = test_df["class"]
        f1, precision, recall = buildClassifiers(clf, X_train, X_test,  
                                                 y_train = y_train, y_test = y_test)
        aList.append(f1)
        bList.append(precision)
        cList.append(recall)

        print("\tF1 for {}:\t\t".format(name), np.mean(aList))
        print("\tPrecision for {}:\t".format(name), np.mean(bList))
        print("\tRecall for {}:\t\t".format(name), np.mean(cList))
        print()
    return aList, bList, cList

In [17]:
print("---Performance with all features---")
overall_f1, overall_precision, overall_recall = train_and_evaluate(X_train = X_train, X_test = X_test, 
                                                                   train_df = train_df, test_df = test_df)

---Performance with all features---
Now classifying Naive_Bayes
	Average F1 for Naive_Bayes:		 0.31207469340875127
	Average Precision for Naive_Bayes:	 0.31207469340875127
	Average Recall for Naive_Bayes:		 0.31207469340875127

Now classifying Decision_Tree
	Average F1 for Decision_Tree:		 0.30504542778690313
	Average Precision for Decision_Tree:	 0.30504542778690313
	Average Recall for Decision_Tree:		 0.30504542778690313

