# Importing Libraries

In [53]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.corpus import PlaintextCorpusReader
import os
import re
# Ignore Warnings
import warnings
warnings.filterwarnings('ignore')

In [220]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, recall_score, precision_score, precision_recall_curve, roc_auc_score, roc_curve
from sklearn.model_selection import GridSearchCV
from sklearn import svm

In [4]:
PATH = "/ITB/CAD-IT Machine Learning Engineer Test 2022 V1.0/Q3"

# Data Preprocessing

## Combining text files

In [164]:
# create function to append every txt file

def getTxt(path):
    main_corpus = PlaintextCorpusReader(path, '.*')

    # list of files
    corpus_list = []

    for filename in os.listdir(path):
        if os.path.isfile(filename):
            if filename.endswith('.txt'):
                corpus_list.append(filename)
    
     # combine sentences from all corpus into one list
    documents = []
    for corpus in corpus_list:
        corpus = main_corpus.raw(corpus)
        d = corpus.strip().split('\n')
        for s in d:
            s = s.strip()
            s = ' '.join(s.split())
            documents.append(s)
            
    return documents

In [165]:
txt_labeled = getTxt(PATH)
txt_labeled[:5]

['### abstract ###',
 'MISC The Minimum Description Length principle for online sequence estimation/prediction in a proper learning setup is studied',
 'MISC If the underlying model class is discrete, then the total expected square loss is a particularly interesting performance measure: (a) this quantity is finitely bounded, implying convergence with probability one, and (b) it additionally specifies the convergence speed',
 'MISC For MDL, in general one can only have loss bounds which are finite but exponentially larger than those for Bayes mixtures',
 'AIMX We show that this is even the case if the model class contains only Bernoulli distributions']

## Split labels and text

In [166]:
sentences = []
labels = []

for txt in txt_labeled:
    # check for headers containing ###
    if txt[:3] != '###':
        # get sentence
        sentence = txt[5:]
        label = txt[:4]

        sentences.append(sentence)
        labels.append(label)

print(sentences[:4])
print(labels[:4])

['The Minimum Description Length principle for online sequence estimation/prediction in a proper learning setup is studied', 'If the underlying model class is discrete, then the total expected square loss is a particularly interesting performance measure: (a) this quantity is finitely bounded, implying convergence with probability one, and (b) it additionally specifies the convergence speed', 'For MDL, in general one can only have loss bounds which are finite but exponentially larger than those for Bayes mixtures', 'We show that this is even the case if the model class contains only Bernoulli distributions']
['MISC', 'MISC', 'MISC', 'AIMX']


## Cleaning sentences

In [167]:
stops = set(stopwords.words('english'))

In [168]:
# decontracting words in english so it have better meaning
def decontract(phrase):
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

In [169]:
def clean_sentence(sentences, stopwords):
  preprocessed_sentences = []
  for sentence in sentences:
    decontract(sentence)
    # removing extra spaces and numbers
    sentence = re.sub("\S*\d\S*", "", sentence).strip()
    # removing non alphabels
    sentence = re.sub('[^A-Za-z]+', ' ', sentence)
    # https://gist.github.com/sebleier/554280
    sentence = ' '.join(e.lower() for e in sentence.split() if e.lower() not in stopwords)
    preprocessed_sentences.append(sentence.strip())
  return preprocessed_sentences

In [170]:
sentences_processed = clean_sentence(sentences, stops)
print(sentences_processed[:4])
print(labels[:4])

['minimum description length principle online sequence estimation prediction proper learning setup studied', 'underlying model class discrete total expected square loss particularly interesting performance measure quantity finitely bounded implying convergence probability one b additionally specifies convergence speed', 'mdl general one loss bounds finite exponentially larger bayes mixtures', 'show even case model class contains bernoulli distributions']
['MISC', 'MISC', 'MISC', 'AIMX']


In [171]:
X = sentences
y = labels

# Modeling

In [191]:
def classif(x, y, estimator):
    y_pred = estimator.predict(x)
    print(classification_report(y, y_pred))

In [212]:
vectorizer = TfidfVectorizer()

In [213]:
X_vector = vectorizer.fit_transform(X)
X_vector = X_vector.toarray()
X_vector.shape

(3117, 4010)

In [215]:
# Split the labeled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_vector, y, test_size=0.2, random_state=0)

In [222]:
classifierSVM = svm.SVC()
classifierSVM.fit(X_train, y_train)

In [223]:
classif(X_test, y_test, classifierSVM)

              precision    recall  f1-score   support

        AIMX       0.88      0.57      0.69        37
        BASE       1.00      0.22      0.36         9
        CONT       0.90      0.29      0.44        31
        MISC       0.90      0.97      0.94       380
        OWNX       0.81      0.87      0.84       167

    accuracy                           0.88       624
   macro avg       0.90      0.58      0.65       624
weighted avg       0.88      0.88      0.86       624



In [216]:
classifierRF = RandomForestClassifier()
classifierRF.fit(X_train, y_train)

In [217]:
classif(X_test, y_test, classifierRF)

              precision    recall  f1-score   support

        AIMX       0.88      0.59      0.71        37
        BASE       1.00      0.22      0.36         9
        CONT       0.55      0.39      0.45        31
        MISC       0.90      0.94      0.92       380
        OWNX       0.79      0.86      0.82       167

    accuracy                           0.86       624
   macro avg       0.82      0.60      0.65       624
weighted avg       0.86      0.86      0.85       624



In [218]:
classifierDT = DecisionTreeClassifier()
classifierDT.fit(X_train, y_train)

In [219]:
classif(X_test, y_test, classifierDT)

              precision    recall  f1-score   support

        AIMX       0.77      0.73      0.75        37
        BASE       0.62      0.56      0.59         9
        CONT       0.43      0.58      0.49        31
        MISC       0.91      0.92      0.91       380
        OWNX       0.85      0.80      0.82       167

    accuracy                           0.85       624
   macro avg       0.72      0.72      0.71       624
weighted avg       0.86      0.85      0.86       624

