In [1]:
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import sklearn
import os, sys
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
def get_rel_path(file_name):
    absolutepath = os.path.abspath('')
    #print(absolutepath)
    fileDirectory = os.path.dirname(absolutepath)
    file_path = os.path.join(fileDirectory, 'Data/' + str(file_name))   
    #print(file_path)
    return file_path

In [None]:
# DO NOT RUN THIS --> code I used to make preprocess input and generate dataframe 
# takes in a raw xml file, extracts the orthographic form & POS tag, and returns it as a bag of words df 
def make_dataframe(soup):
    chunks = soup.find_all("chunk")
    ctag = soup.find_all("ctag")
    
    # make bow 
        # all lowercase 
        # add <s> and </s> markers 
    context_window_list = []
    for chunk in chunks:
        orth_forms = chunk.find_all("orth")
        for i in range(0, len(orth_forms)):
            row = []
            target_word = orth_forms[i].get_text()
            # get three words before:
            for j in reversed(range(1,4)):
                if (i-j) < 0: row.append("<s>") # add beginning of sentence marker 
                else: row.append(orth_forms[i-j].get_text())
            # add target word
            row.append(target_word)
            # get three words after:
            for j in range(1,4):
                if (i+j) > len(orth_forms) - 1: row.append("</s>") # add end of sentence marker 
                else: row.append(orth_forms[i+j].get_text())

            context_window_list.append(row)
        
    # get pos tags 
    classes = []
    for i in range(0, len(ctag)):
        ct = ctag[i].get_text().split(":")[0] #get the first tag  
        classes.append(ct)
    
    #make bow df 
    context_window = np.array(context_window_list)
    context_window = np.transpose(context_window)
    data = {"a1": context_window[0], "a2": context_window[1], "a3": context_window[2], 
       "a4": context_window[3], "a5": context_window[4], "a6": context_window[5], 
       "a7": context_window[6], "class": classes}
    df = pd.DataFrame(data)
    return df 

#load train and validate datasets in dataframes
train = get_rel_path("train.xml")
train = open(train, 'r')
train_contents = train.read()
train_soup = BeautifulSoup(train_contents, 'xml')
train_df = make_dataframe(train_soup)

validate = get_rel_path("validate.xml")
validate = open(validate, 'r')
validate_contents = validate.read()
validate_soup = BeautifulSoup(validate_contents, 'xml')
validate_df = make_dataframe(validate_soup)

test = get_rel_path("Data/test-full-1.xml")
test = open(test, 'r')
test_contents = test.read()
test_soup = BeautifulSoup(test_contents, 'xml')
test_df = make_dataframe(test_soup)

#export to csv for easier loading
t_csv = get_rel_path("train_improved.csv")
train_df.to_csv(t_csv, index = False)
v_csv = get_rel_path("validate_improved.csv")
validate_df.to_csv(v_csv, index = False)
test_csv = get_rel_path("test_improved.csv")
test_df.to_csv(test_csv, index = False)

In [4]:
# RUN THIS --> LOAD train & validate datasets from csv files
t_csv = get_rel_path("train_improved.csv")
train_df = pd.read_csv(t_csv)

v_csv = get_rel_path("validate_improved.csv")
validate_df = pd.read_csv(v_csv)

test_csv = get_rel_path("test_improved.csv")
test_df = pd.read_csv(test_csv)

In [20]:
# combine validate_df + train_df to form the train test set:
frames = [train_df, validate_df]
train_df = pd.concat(frames)
train_df.head(n=10)

Unnamed: 0,a1,a2,a3,a4,a5,a6,a7,class
0,<s>,<s>,<s>,Zabiję,cię,",",jeśli,fin
1,<s>,<s>,Zabiję,cię,",",jeśli,umrzesz,ppron12
2,<s>,Zabiję,cię,",",jeśli,umrzesz,!,interp
3,Zabiję,cię,",",jeśli,umrzesz,!,"""",comp
4,cię,",",jeśli,umrzesz,!,"""",</s>,fin
5,",",jeśli,umrzesz,!,"""",</s>,</s>,interp
6,jeśli,umrzesz,!,"""",</s>,</s>,</s>,interp
7,<s>,<s>,<s>,Cieszy,fakt,",",że,fin
8,<s>,<s>,Cieszy,fakt,",",że,Royal,subst
9,<s>,Cieszy,fakt,",",że,Royal,Canin,interp


In [21]:
train_df.describe()

Unnamed: 0,a1,a2,a3,a4,a5,a6,a7,class
count,972588,972587,972587,972587,972590,972591,972592,972600
unique,110130,115604,125745,126215,116249,110899,105735,35
top,<s>,<s>,<s>,.,.,</s>,</s>,subst
freq,204199,136983,68530,72728,72724,136983,204199,265749


In [22]:
test_df.head(n=10)

Unnamed: 0,a1,a2,a3,a4,a5,a6,a7,class
0,<s>,<s>,<s>,Rozumiem,",",że,Olechowskiego,fin
1,<s>,<s>,Rozumiem,",",że,Olechowskiego,",",interp
2,<s>,Rozumiem,",",że,Olechowskiego,",",który,comp
3,Rozumiem,",",że,Olechowskiego,",",który,był,subst
4,",",że,Olechowskiego,",",który,był,wtedy,interp
5,że,Olechowskiego,",",który,był,wtedy,po,adj
6,Olechowskiego,",",który,był,wtedy,po,zgoła,praet
7,",",który,był,wtedy,po,zgoła,innej,adv
8,który,był,wtedy,po,zgoła,innej,stronie,prep
9,był,wtedy,po,zgoła,innej,stronie,",",qub


In [23]:
test_df.describe()

Unnamed: 0,a1,a2,a3,a4,a5,a6,a7,class
count,242912,242911,242911,242911,242911,242912,242912,242913
unique,45230,47803,52424,52655,48636,46055,43587,35
top,<s>,<s>,<s>,.,.,</s>,</s>,subst
freq,51066,34245,17133,18217,18216,34245,51066,65871


In [36]:
from sklearn import preprocessing
# Transform the training & validation data from symbols to numbers 
# Takes in a df of the correct format & returns a numerical representation of the data 
def make_vectors(df):
    labels = np.asarray(df['class'].astype("category").cat.codes.tolist())
    print('There are {} classes (POS labels) in this data.'.format(len(list(set(labels)))))
    labels[:5]
    
    X_vals = df.drop(columns=['class']).values
    #print("X Values:")
    #print(X_vals)
    
    le = preprocessing.LabelEncoder()
    le.fit(X_vals.ravel())
    X = le.transform(X_vals.ravel())
    X = X.reshape(len(X_vals), -1)
    len(list(set(X_vals.ravel())))
    #print("\nNumerical representation:")
    #print(X)
    return X 

In [25]:
# Transform the data from symbols to numbers 
X_train = make_vectors(train_df)
print()
X_test = make_vectors(test_df)

There are 35 classes (POS labels) in this data.
X Values:
[['<s>' '<s>' '<s>' ... 'cię' ',' 'jeśli']
 ['<s>' '<s>' 'Zabiję' ... ',' 'jeśli' 'umrzesz']
 ['<s>' 'Zabiję' 'cię' ... 'jeśli' 'umrzesz' '!']
 ...
 ['-' 'prosił' 'nowotarżan' ... 'Zanussi' '.' '</s>']
 ['prosił' 'nowotarżan' 'Krzysztof' ... '.' '</s>' '</s>']
 ['nowotarżan' 'Krzysztof' 'Zanussi' ... '</s>' '</s>' '</s>']]

Numerical representation:
[[  1914   1914   1914 ...  42769     20  54879]
 [  1914   1914  34820 ...     20  54879 105789]
 [  1914  34820  42769 ...  54879 105789      0]
 ...
 [    21  84911  69972 ...  35240     44   1913]
 [ 84911  69972  14558 ...     44   1913   1913]
 [ 69972  14558  35240 ...   1913   1913   1913]]

There are 35 classes (POS labels) in this data.
X Values:
[['<s>' '<s>' '<s>' ... ',' 'że' 'Olechowskiego']
 ['<s>' '<s>' 'Rozumiem' ... 'że' 'Olechowskiego' ',']
 ['<s>' 'Rozumiem' ',' ... 'Olechowskiego' ',' 'który']
 ...
 ['matki' 'większej' 'do' ... "''" '.' '</s>']
 ['większej' 'do' 

# TRAIN and EVALUATE THE MODELS

In [26]:
# Define a function to take as input training and testing vectors and labels
# Allow this to be extensible to let multiple classifiers be used here
def buildClassifiers(clf, X_train, X_test, y_train, y_test):

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    f1 = f1_score(y_test, y_pred, average="micro")
    precision = precision_score(y_test, y_pred, average="micro")
    recall = recall_score(y_test, y_pred, average="micro")

    return f1, precision, recall

In [55]:
# Construct the classifiers 
def train_and_evaluate(X_train, X_test, train_df, test_df):
    names = ['Naive_Bayes', 'Decision_Tree']
    classifiers = [GaussianNB(), 
                   DecisionTreeClassifier(random_state=42)]
    aList, bList, cList = list(), list(), list()
    itr = 0
    for name, clf in zip(names, classifiers):
        print('Now classifying', name)
        y_train = train_df["class"]
        y_test = test_df["class"]
        f1, precision, recall = buildClassifiers(clf, X_train, X_test,  
                                                 y_train = y_train, y_test = y_test)
        aList.append(f1)
        bList.append(precision)
        cList.append(recall)

        print("\tF1 for {}:\t\t".format(name), aList[itr])
        print("\tPrecision for {}:\t".format(name), bList[itr])
        print("\tRecall for {}:\t\t".format(name), cList[itr])
        print()
        itr = itr + 1 
    return aList, bList, cList

In [56]:
print("---Performance with all features---")
overall_f1, overall_precision, overall_recall = train_and_evaluate(X_train = X_train, X_test = X_test, 
                                                                   train_df = train_df, test_df = test_df)
overall_metrics = [overall_f1, overall_precision, overall_recall]

---Performance with all features---
Now classifying Naive_Bayes
	F1 for Naive_Bayes:		 0.24172852008743873
	Precision for Naive_Bayes:	 0.24172852008743873
	Recall for Naive_Bayes:		 0.24172852008743873

Now classifying Decision_Tree
	F1 for Decision_Tree:		 0.36643571978444955
	Precision for Decision_Tree:	 0.36643571978444955
	Recall for Decision_Tree:		 0.36643571978444955



# FEATURE ENGINEERING

In [63]:
# function takes in performance metrics for each run & calculates difference from overall performance 
def calc_diff(indiv_f1, indiv_precision, indiv_recall):
    indiv_metrics = [indiv_f1, indiv_precision, indiv_recall]
    metrics = ["F1", "Precision", "Recall"]
    for model in range(0,2):
        print("\tPerformance difference for Model", model+1)
        for i in range(len(metrics)): # for each metric
            overall = overall_metrics[i][model]
            indiv = indiv_metrics[i][model]  
            diff = overall - indiv 
            print("\t\t-", metrics[i], " \t", diff)
    return

In [64]:
f1_list, precision_list, recall_list = [], [], [] #hold metrics for each indiv performance 
# leaves out a single attribute column each time & obtains performance metrics 
for index, column in enumerate(train_df):
    if index == 7: break
    print("----", column, " attribute removed")
    temp_train_df = train_df.copy()
    temp_train_df[column] = "" #keep column (so train & test have same # of columns) but remove all values 
    X_train_temp = make_vectors(temp_train_df)
    f1, precision, recall = train_and_evaluate(X_train = X_train_temp, X_test = X_test, 
                                               train_df = temp_train_df, test_df = test_df)
    calc_diff(f1, precision, recall)
    f1_list.append(f1)
    precision_list.append(precision)
    recall_list.append(recall)
    print()

---- a1  attribute removed
There are 35 classes (POS labels) in this data.
Now classifying Naive_Bayes
	F1 for Naive_Bayes:		 0.23797408948882934
	Precision for Naive_Bayes:	 0.23797408948882934
	Recall for Naive_Bayes:		 0.23797408948882934

Now classifying Decision_Tree
	F1 for Decision_Tree:		 0.3646737720912425
	Precision for Decision_Tree:	 0.3646737720912425
	Recall for Decision_Tree:		 0.3646737720912425

	Performance difference for Model 1
		- F1  	 0.0037544305986093873
		- Precision  	 0.0037544305986093873
		- Recall  	 0.0037544305986093873
	Performance difference for Model 2
		- F1  	 0.0017619476932070333
		- Precision  	 0.0017619476932070333
		- Recall  	 0.0017619476932070333

---- a2  attribute removed
There are 35 classes (POS labels) in this data.
Now classifying Naive_Bayes
	F1 for Naive_Bayes:		 0.24454845973661352
	Precision for Naive_Bayes:	 0.24454845973661352
	Recall for Naive_Bayes:		 0.24454845973661352

Now classifying Decision_Tree
	F1 for Decision_Tree:		