In [5]:
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import sklearn
import os, sys
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
def get_rel_path(file_name):
    absolutepath = os.path.abspath('')
    #print(absolutepath)
    fileDirectory = os.path.dirname(absolutepath)
    file_path = os.path.join(fileDirectory, 'Data/' + str(file_name))   
    #print(file_path)
    return file_path

In [6]:
# DO NOT RUN THIS --> code I used to make preprocess input and generate dataframe 
# takes in a raw xml file, extracts the orthographic form & POS tag, and returns it as a bag of words df 
def make_dataframe(soup, predictor, output):
    chunks = soup.find_all("chunk")
    ctag = soup.find_all("ctag")
    
    # make bow 
        # all lowercase 
        # add <s> and </s> markers 
    context_window_list = []
    for chunk in chunks:
        forms = chunk.find_all(predictor) #this will either be the orth form OR the lemma 
        for i in range(0, len(forms)):
            row = []
            target_word = forms[i].get_text()
            # get three words before:
            for j in reversed(range(1,4)):
                if (i-j) < 0: row.append("<s>") # add beginning of sentence marker 
                else: row.append(forms[i-j].get_text())
            # add target word
            row.append(target_word)
            # get three words after:
            for j in range(1,4):
                if (i+j) > len(forms) - 1: row.append("</s>") # add end of sentence marker 
                else: row.append(forms[i+j].get_text())

            context_window_list.append(row)
        
    # get pos tags 
    classes = []
    for i in range(0, len(ctag)):
        if output == "ctag": ct = ctag[i].get_text() #obtain entire ctag 
        else: ct = ctag[i].get_text().split(":")[0] # if POS tag specified --> get the first tag  
        classes.append(ct)
    
    #make bow df 
    context_window = np.array(context_window_list)
    context_window = np.transpose(context_window)
    data = {"a1": context_window[0], "a2": context_window[1], "a3": context_window[2], 
       "a4": context_window[3], "a5": context_window[4], "a6": context_window[5], 
       "a7": context_window[6], "class": classes}
    df = pd.DataFrame(data)
    return df 


In [8]:
# DO NOT RUN THIS <-- prepping train & test sets 
# prepare training & test sets 
#load train and validate datasets in dataframes
train = get_rel_path("train.xml")
train = open(train, 'r')
train_contents = train.read()
train_soup = BeautifulSoup(train_contents, 'xml')
train_df = make_dataframe(train_soup)

validate = get_rel_path("validate.xml")
validate = open(validate, 'r')
validate_contents = validate.read()
validate_soup = BeautifulSoup(validate_contents, 'xml')
validate_df = make_dataframe(validate_soup)

test = get_rel_path("Data/test-full-1.xml")
test = open(test, 'r')
test_contents = test.read()
test_soup = BeautifulSoup(test_contents, 'xml')
test_df = make_dataframe(test_soup)

# TEST 1: predictor = orth form ; output = full ctag
train1_df = make_dataframe(train_soup, predictor="orth", output="ctag")
validate1_df = make_dataframe(validate_soup, predictor="orth", output="ctag")
test1_df = make_dataframe(test_soup, predictor="orth", output = "ctag")
# export to csv for easier loading:
train1_df.to_csv(get_rel_path("train1_ec1.csv"), index = False)
validate1_df.to_csv(get_rel_path("validate1_ec1.csv"), index = False)
test1_df.to_csv(get_rel_path("test1_ec1.csv"), index = False)

# TEST 2: predictor = lemma; output = POS tag 
train2_df = make_dataframe(train_soup, predictor="base", output="pos")
validate2_df = make_dataframe(validate_soup, predictor="base", output="pos")
test2_df = make_dataframe(test_soup, predictor="base", output = "pos")
# export to csv for easier loading:
train2_df.to_csv(get_rel_path("train2_ec1.csv"), index = False)
validate2_df.to_csv(get_rel_path("validate2_ec1.csv"), index = False)
test2_df.to_csv(get_rel_path("test2_ec1.csv"), index = False)

# TEST 3: predictor = lemma; output = full tag 
train3_df = make_dataframe(train_soup, predictor="base", output="ctag")
validate3_df = make_dataframe(validate_soup, predictor="base", output="ctag")
test3_df = make_dataframe(test_soup, predictor="base", output = "ctag")
# export to csv for easier loading:
train3_df.to_csv(get_rel_path("train3_ec1.csv"), index = False)
validate3_df.to_csv(get_rel_path("validate3_ec1.csv"), index = False)
test3_df.to_csv(get_rel_path("test3_ec1.csv"), index = False)

In [7]:
# RUN THIS --> LOAD train & validate datasets from csv files
# Test 1 
train1_df = pd.read_csv(get_rel_path("train1_ec1.csv"))
validate1_df = pd.read_csv(get_rel_path("validate1_ec1.csv"))
test1_df = pd.read_csv(get_rel_path("test1_ec1.csv"))

# Test 2
train2_df = pd.read_csv(get_rel_path("train2_ec1.csv"))
validate2_df = pd.read_csv(get_rel_path("validate2_ec1.csv"))
test2_df = pd.read_csv(get_rel_path("test2_ec1.csv"))

# Test 3
train3_df = pd.read_csv(get_rel_path("train3_ec1.csv"))
validate3_df = pd.read_csv(get_rel_path("validate3_ec1.csv"))
test3_df = pd.read_csv(get_rel_path("test3_ec1.csv"))


In [4]:
# combine validate_df + train_df to form the train test set:
# Test 1: predictor = orth form ; output = full ctag
frames = [train1_df, validate1_df]
train1_df = pd.concat(frames)
print(train1_df.head(n=10))
print(train1_df.describe())
print()

# Test 2: predictor = lemma; output = POS tag 
frames = [train2_df, validate2_df]
train2_df = pd.concat(frames)
print(train2_df.head(n=10))
print(train2_df.describe())
print()

# Test 3: predictor = lemma; output = full tag 
frames = [train3_df, validate3_df]
train3_df = pd.concat(frames)
print(train3_df.head(n=10))
print(train3_df.describe())
print()

       a1       a2       a3       a4       a5       a6       a7  \
0     <s>      <s>      <s>   Zabiję      cię        ,    jeśli   
1     <s>      <s>   Zabiję      cię        ,    jeśli  umrzesz   
2     <s>   Zabiję      cię        ,    jeśli  umrzesz        !   
3  Zabiję      cię        ,    jeśli  umrzesz        !        "   
4     cię        ,    jeśli  umrzesz        !        "     </s>   
5       ,    jeśli  umrzesz        !        "     </s>     </s>   
6   jeśli  umrzesz        !        "     </s>     </s>     </s>   
7     <s>      <s>      <s>   Cieszy     fakt        ,       że   
8     <s>      <s>   Cieszy     fakt        ,       że    Royal   
9     <s>   Cieszy     fakt        ,       że    Royal    Canin   

                        class  
0             fin:sg:pri:perf  
1  ppron12:sg:acc:m1:sec:nakc  
2                      interp  
3                        comp  
4             fin:sg:sec:perf  
5                      interp  
6                      interp  
7     

In [11]:
from sklearn import preprocessing
# Transform the training & validation data from symbols to numbers 
# Takes in a df of the correct format & returns a numerical representation of the data 
def make_vectors(df):
    labels = np.asarray(df['class'].astype("category").cat.codes.tolist())
    print('There are {} classes in this data.'.format(len(list(set(labels)))))
    labels[:5]
    
    X_vals = df.drop(columns=['class']).values
    #print("X Values:")
    #print(X_vals)
    
    le = preprocessing.LabelEncoder()
    le.fit(X_vals.ravel())
    X = le.transform(X_vals.ravel())
    X = X.reshape(len(X_vals), -1)
    len(list(set(X_vals.ravel())))
    #print("\nNumerical representation:")
    #print(X)
    return X 

In [12]:
# Transform the data from symbols to numbers
# Test 1 : predictor = orth form ; output = full ctag
X_train1 = make_vectors(train1_df)
X_test1 = make_vectors(test1_df)
# Test 2 : predictor = lemma; output = POS tag 
X_train2 = make_vectors(train2_df)
X_test2 = make_vectors(test2_df)
# Test 3 : predictor = lemma; output = full tag 
X_test3 = make_vectors(test3_df)
X_train3 = make_vectors(train3_df)

There are 896 classes (POS labels) in this data.
There are 757 classes (POS labels) in this data.
There are 35 classes (POS labels) in this data.
There are 35 classes (POS labels) in this data.
There are 757 classes (POS labels) in this data.
There are 896 classes (POS labels) in this data.


# TRAIN and EVALUATE THE MODELS

In [13]:
# Define a function to take as input training and testing vectors and labels
# Allow this to be extensible to let multiple classifiers be used here
def buildClassifiers(clf, X_train, X_test, y_train, y_test):

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    f1 = f1_score(y_test, y_pred, average="micro")
    precision = precision_score(y_test, y_pred, average="micro")
    recall = recall_score(y_test, y_pred, average="micro")

    return f1, precision, recall

In [14]:
# Construct the classifiers 
def train_and_evaluate(X_train, X_test, train_df, test_df):
    names = ['Naive_Bayes', 'Decision_Tree']
    classifiers = [GaussianNB(), 
                   DecisionTreeClassifier(random_state=42)]
    aList, bList, cList = list(), list(), list()
    itr = 0
    for name, clf in zip(names, classifiers):
        print('Now classifying', name)
        y_train = train_df["class"]
        y_test = test_df["class"]
        f1, precision, recall = buildClassifiers(clf, X_train, X_test,  
                                                 y_train = y_train, y_test = y_test)
        aList.append(f1)
        bList.append(precision)
        cList.append(recall)

        print("\tF1 for {}:\t\t".format(name), aList[itr])
        print("\tPrecision for {}:\t".format(name), bList[itr])
        print("\tRecall for {}:\t\t".format(name), cList[itr])
        print()
        itr = itr + 1 
    return aList, bList, cList

In [15]:
print("Test 1: predictor = orth form ; output = full ctag")
train_and_evaluate(X_train = X_train1, X_test = X_test1, train_df = train1_df, test_df = test1_df)
print()
print("Test 2: predictor = lemma; output = POS tag ")
train_and_evaluate(X_train = X_train2, X_test = X_test2, train_df = train2_df, test_df = test2_df)
print()
print("Test 3: predictor = lemma; output = full tag ")
train_and_evaluate(X_train = X_train3, X_test = X_test3, train_df = train3_df, test_df = test3_df)

Test 1: predictor = orth form ; output = full ctag
Now classifying Naive_Bayes
	F1 for Naive_Bayes:		 0.1662899885967404
	Precision for Naive_Bayes:	 0.1662899885967404
	Recall for Naive_Bayes:		 0.1662899885967404

Now classifying Decision_Tree
	F1 for Decision_Tree:		 0.17127119586024625
	Precision for Decision_Tree:	 0.17127119586024625
	Recall for Decision_Tree:		 0.17127119586024625


Test 2: predictor = lemma; output = POS tag 
Now classifying Naive_Bayes
	F1 for Naive_Bayes:		 0.2727272727272727
	Precision for Naive_Bayes:	 0.2727272727272727
	Recall for Naive_Bayes:		 0.2727272727272727

Now classifying Decision_Tree
	F1 for Decision_Tree:		 0.3104609469233841
	Precision for Decision_Tree:	 0.3104609469233841
	Recall for Decision_Tree:		 0.3104609469233841


Test 3: predictor = lemma; output = full tag 
Now classifying Naive_Bayes
	F1 for Naive_Bayes:		 0.16782963447818766
	Precision for Naive_Bayes:	 0.16782963447818766
	Recall for Naive_Bayes:		 0.16782963447818766

Now class

([0.16782963447818766, 0.09685360602355578],
 [0.16782963447818766, 0.09685360602355576],
 [0.16782963447818766, 0.09685360602355576])