In [None]:
import os
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from Levenshtein import distance
from scipy.stats import uniform, randint
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Absolute path of the directory the notebook is executed from
workingDir = os.path.abspath(os.curdir)

In [None]:
# Global settings shared by the cells below
# `language` selects the `input_<language>` text column and the NLTK stop-word list
language = "german"
# n-gram sizes handed to the CountVectorizer (unigrams and bigrams)
ngram_range = (1, 2)

In [None]:
# Scoring metrics evaluated by every randomized search: accuracy plus
# micro- and macro-averaged precision, recall and F1
scoring = ["accuracy"] + [
    f"{metric}_{average}"
    for average in ("micro", "macro")
    for metric in ("precision", "recall", "f1")
]
# Metric used to select the best candidate and refit it
refit = "f1_macro"

In [None]:
def spell_correction(word, corpus):
    """Replace an unknown word with its closest match from ``corpus``.

    Parameters
    ----------
    word : str
        The word to check.
    corpus : collection of str
        Known vocabulary. Any sized, iterable container supporting ``in``
        works (list, tuple, set, ...).

    Returns
    -------
    str
        ``word`` unchanged when it is already in ``corpus``, when ``corpus``
        is empty, or when the closest corpus word is too far away; otherwise
        the corpus word with the smallest Levenshtein distance (the first one
        in iteration order on ties).
    """
    # Nothing to correct against, or the word is already known
    if len(corpus) == 0 or word in corpus:
        return word

    # Closest corpus entry by Levenshtein distance; min() keeps the first
    # candidate when several share the smallest distance
    min_distance, closest = min(
        ((distance(word, candidate), candidate) for candidate in corpus),
        key=lambda pair: pair[0],
    )

    # Accept the replacement only if the distance does not exceed
    # len(word) / 10 + 1 (one edit, plus one more per ten characters)
    if len(word) / 10 + 1 >= min_distance:
        return closest
    return word

### Read the train and test data

In [None]:
# Load the train and test splits from <workingDir>/data
data_dir = os.path.join(workingDir, 'data')
df_train = pd.read_csv(os.path.join(data_dir, 'train.csv'))
df_test = pd.read_csv(os.path.join(data_dir, 'test.csv'))

Correct the test-set vocabulary: replace words that do not occur in the training data with their closest training word.

In [None]:
# Collect every whitespace-separated token of the training sentences
all_words = [word for sentence in df_train[f"input_{language}"] for word in sentence.split()]
# Build the corpus of unique training words. It is sorted because the iteration
# order of a set of strings can change between interpreter runs, and
# spell_correction resolves distance ties by corpus position; a fixed order
# keeps the corrections reproducible.
corpus = sorted(set(all_words))

In [None]:
# Spell-correct every test sentence against the training corpus.
# Each distinct word is corrected once and cached, so repeated words do not
# trigger another full scan of the corpus.
correction_cache = {}
documents = []

# Iterate over the column values directly instead of looking rows up by
# integer label, so the loop does not depend on df_test having a 0..n-1 index
for sentence in df_test[f"input_{language}"]:
    corrected_words = []
    # str() keeps non-string cells (e.g. missing values) from breaking split()
    for word in str(sentence).split():
        if word not in correction_cache:
            correction_cache[word] = spell_correction(word, corpus)
        corrected_words.append(correction_cache[word])

    # Join the corrected words back into a single string
    documents.append(' '.join(corrected_words))

# Positional assignment: one corrected sentence per row, no index alignment
df_test[f"input_{language}"] = documents

In [None]:
# Features are the raw sentences of the selected language ...
X_train = df_train[f"input_{language}"]
X_test = df_test[f"input_{language}"]
# ... and the target is the Topology column
y_train = df_train["Topology"]
y_test = df_test["Topology"]

# Prepare the data
## Transform the text into a bag-of-words (n-gram count) representation

In [None]:
def text_to_var(X_train, X_test, ngram_range, max_features=500, min_df=2, max_df=0.8,
                stop_words_language=None):
    """Turn raw text into dense n-gram count matrices.

    The vocabulary is learned from ``X_train`` only; ``X_test`` is transformed
    with the already fitted vectorizer.

    Parameters
    ----------
    X_train, X_test : iterable of str
        Training and test documents.
    ngram_range : tuple (min_n, max_n)
        Passed to ``CountVectorizer``.
    max_features : int, default 500
        Passed to ``CountVectorizer``.
    min_df : int or float, default 2
        Passed to ``CountVectorizer``.
    max_df : int or float, default 0.8
        Passed to ``CountVectorizer``.
    stop_words_language : str or None, default None
        Language name given to ``stopwords.words``; ``None`` falls back to the
        notebook-level ``language`` setting.

    Returns
    -------
    tuple
        ``(X_train_vec, X_test_vec)`` as dense arrays.
    """
    # Keep the original behaviour of reading the global language when the
    # caller does not specify one
    if stop_words_language is None:
        stop_words_language = language

    vectorizer = CountVectorizer(max_features=max_features,
                                 min_df=min_df,
                                 max_df=max_df,
                                 ngram_range=ngram_range,
                                 stop_words=stopwords.words(stop_words_language))

    # Fit on the training documents only, then reuse the fitted vocabulary
    # for the test documents
    X_train_vec = vectorizer.fit_transform(X_train).toarray()
    X_test_vec = vectorizer.transform(X_test).toarray()

    return X_train_vec, X_test_vec

# Vectorize both splits with the helper defined above
X_train_vec, X_test_vec = text_to_var(
    X_train=X_train,
    X_test=X_test,
    ngram_range=ngram_range,
)

## Predict Topology from text

In [None]:
def plot_results_topology(y_test, y_pred):
    """Print a classification summary, plot the confusion matrix and return metrics.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    tuple
        ``(accuracy, precision_micro, recall_micro, f1_micro,
        precision_macro, recall_macro, f1_macro)``.
    """
    # Axis labels must cover every class that occurs in the truth OR in the
    # predictions; using the true labels alone gives too few tick labels when
    # the model predicts a class that is absent from y_test
    classes = np.union1d(y_test, y_pred)

    # Print accuracy and the per-class precision / recall / F1 report
    accuracy = metrics.accuracy_score(y_test, y_pred)
    print("Accuracy:", round(accuracy, 2))
    print("Detail:")
    print(metrics.classification_report(y_test, y_pred))

    # Plot the confusion matrix with rows/columns in the same order as the
    # tick labels
    cm = metrics.confusion_matrix(y_test, y_pred, labels=classes)
    fig, ax = plt.subplots()
    sns.heatmap(cm, annot=True, fmt='d', ax=ax, cmap=plt.cm.Blues, cbar=False)
    ax.set(xlabel="Pred", ylabel="True", xticklabels=classes, yticklabels=classes, title="Confusion matrix")
    plt.yticks(rotation=0)
    plt.show()

    # Micro- and macro-averaged precision, recall and F1 (support is unused)
    precision_micro, recall_micro, f1_micro, _ = metrics.precision_recall_fscore_support(y_test, y_pred, average='micro')
    precision_macro, recall_macro, f1_macro, _ = metrics.precision_recall_fscore_support(y_test, y_pred, average='macro')

    return accuracy, precision_micro, recall_micro, f1_micro, precision_macro, recall_macro, f1_macro

### Random Forest

randomized search

In [None]:
# Single-step pipeline, so hyper-parameters are addressed as "clf__<name>"
pipeline = Pipeline(steps=[("clf", RandomForestClassifier(random_state=0))])

# Candidate values the randomized search samples from
parameter_grid = {
    "clf__n_estimators": [10, 100, 1000],
    "clf__max_depth": [3, 5, 10, None],
    "clf__min_samples_split": [2, 5, 10],
    "clf__max_samples": [0.1, 0.5, 0.8, None],
    "clf__max_features": [1, 4, 7, "sqrt"],
}

k_fold_cv = 5  # Stratified 5-fold cross validation
n_iter = 100   # number of sampled parameter settings
random_search = RandomizedSearchCV(
    pipeline,
    parameter_grid,
    n_iter=n_iter,
    scoring=scoring,
    refit=refit,
    cv=k_fold_cv,
    n_jobs=-1,
    random_state=0,
    verbose=1,
)

# Run the search on the vectorized training data and show the winning setting
random_search.fit(X_train_vec, y_train)
random_search.best_params_

fit model

In [None]:
# Rebuild the forest from the best setting found by the search; the pipeline
# prefix "clf__" is stripped so the names match the estimator's arguments
best_rf_params = {
    name.split("__", 1)[1]: value
    for name, value in random_search.best_params_.items()
}
classifier = RandomForestClassifier(random_state=0, **best_rf_params)
classifier.fit(X_train_vec, y_train)
y_pred = classifier.predict(X_test_vec)

# Report and keep the test-set metrics
(
    accuracy_test,
    precision_micro_test,
    recall_micro_test,
    f1_micro_test,
    precision_macro_test,
    recall_macro_test,
    f1_macro_test,
) = plot_results_topology(y_test, y_pred)

### Naive Bayes

In [None]:
# Single-step pipeline, so hyper-parameters are addressed as "clf__<name>"
pipeline = Pipeline(steps=[("clf", MultinomialNB())])

# Only one candidate setting is listed here, so the search mainly yields the
# cross-validated scores for it
parameter_grid = {"clf__alpha": [1]}

k_fold_cv = 5  # Stratified 5-fold cross validation
n_iter = 100   # requested number of sampled settings
random_search = RandomizedSearchCV(
    pipeline,
    parameter_grid,
    n_iter=n_iter,
    scoring=scoring,
    refit=refit,
    cv=k_fold_cv,
    n_jobs=-1,
    random_state=0,
    verbose=1,
)

# Run the search on the vectorized training data and show the winning setting
random_search.fit(X_train_vec, y_train)
random_search.best_params_

fit model

In [None]:
# Build the final model from the search result, like the other model sections,
# so that editing the parameter grid cannot silently diverge from the fitted model
classifier = MultinomialNB(alpha=random_search.best_params_["clf__alpha"])
classifier.fit(X_train_vec, y_train)
y_pred = classifier.predict(X_test_vec)

# Report and keep the test-set metrics
accuracy_test, precision_micro_test, recall_micro_test, f1_micro_test, precision_macro_test, recall_macro_test, f1_macro_test = plot_results_topology(y_test, y_pred)

### Logistic Regression

randomized search

In [None]:
# Single-step pipeline, so hyper-parameters are addressed as "clf__<name>"
pipeline = Pipeline(steps=[("clf", LogisticRegression(random_state=0))])

# Only one candidate setting (no regularization penalty) is listed here
parameter_grid = {"clf__penalty": [None]}

k_fold_cv = 5  # Stratified 5-fold cross validation
n_iter = 100   # requested number of sampled settings
random_search = RandomizedSearchCV(
    pipeline,
    parameter_grid,
    n_iter=n_iter,
    scoring=scoring,
    refit=refit,
    cv=k_fold_cv,
    n_jobs=-1,
    random_state=0,
    verbose=1,
)

# Run the search on the vectorized training data and show the winning setting
random_search.fit(X_train_vec, y_train)
random_search.best_params_

fit model

In [None]:
# Build the final model from the search result, like the other model sections,
# instead of repeating the grid value by hand
classifier = LogisticRegression(random_state=0,
                                penalty=random_search.best_params_["clf__penalty"])
classifier.fit(X_train_vec, y_train)
y_pred = classifier.predict(X_test_vec)

# Report and keep the test-set metrics
accuracy_test, precision_micro_test, recall_micro_test, f1_micro_test, precision_macro_test, recall_macro_test, f1_macro_test = plot_results_topology(y_test, y_pred)

### Support Vector Machines

randomized search

In [None]:
# Single-step pipeline, so hyper-parameters are addressed as "clf__<name>"
pipeline = Pipeline(steps=[("clf", SVC(random_state=0, gamma="scale"))])

# Candidate regularization strengths and kernels
parameter_grid = {
    "clf__C": [1, 10, 100, 1000],
    "clf__kernel": ["linear", "rbf"],
}

k_fold_cv = 5  # Stratified 5-fold cross validation
n_iter = 100   # requested number of sampled settings
random_search = RandomizedSearchCV(
    pipeline,
    parameter_grid,
    n_iter=n_iter,
    scoring=scoring,
    refit=refit,
    cv=k_fold_cv,
    n_jobs=-1,
    random_state=0,
    verbose=1,
)

# Run the search on the vectorized training data and show the winning setting
random_search.fit(X_train_vec, y_train)
random_search.best_params_

fit model

In [None]:
# Rebuild the SVM from the best setting found by the search; the pipeline
# prefix "clf__" is stripped so the names match the estimator's arguments
best_svc_params = {
    name.split("__", 1)[1]: value
    for name, value in random_search.best_params_.items()
}
classifier = SVC(random_state=0, gamma="scale", probability=True, **best_svc_params)
classifier.fit(X_train_vec, y_train)
y_pred = classifier.predict(X_test_vec)

# Report and keep the test-set metrics
(
    accuracy_test,
    precision_micro_test,
    recall_micro_test,
    f1_micro_test,
    precision_macro_test,
    recall_macro_test,
    f1_macro_test,
) = plot_results_topology(y_test, y_pred)

### XGBoost

randomized search

In [None]:
# Encode the class labels as integers for XGBoost
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)

# Single-step pipeline, so hyper-parameters are addressed as "clf__<name>"
pipeline = Pipeline(steps=[("clf", xgb.XGBClassifier(random_state=0))])

# Distributions the randomized search samples from.
# scipy's uniform(loc, scale) covers [loc, loc + scale]; randint(a, b) covers a..b-1.
parameter_grid = {
    "clf__colsample_bytree": uniform(0.7, 0.3),
    "clf__gamma": uniform(0, 0.5),
    "clf__learning_rate": uniform(0.03, 0.3),  # default 0.1
    "clf__max_depth": randint(2, 6),  # default 3
    "clf__n_estimators": randint(100, 150),  # default 100
    "clf__subsample": uniform(0.6, 0.4),
}

k_fold_cv = 5  # Stratified 5-fold cross validation
n_iter = 100   # number of sampled parameter settings
random_search = RandomizedSearchCV(
    pipeline,
    parameter_grid,
    n_iter=n_iter,
    scoring=scoring,
    refit=refit,
    cv=k_fold_cv,
    n_jobs=-1,
    random_state=0,
    verbose=1,
)

# Run the search on the encoded training labels and show the winning setting
random_search.fit(X_train_vec, y_train_encoded)
random_search.best_params_

fit model

In [None]:
# Encode the test labels with the encoder that was fitted on the TRAINING
# labels. Refitting it on y_test could assign different integers to the same
# class whenever the two label sets differ, which would corrupt the metrics.
# transform() raises ValueError if y_test contains a label unseen in training.
y_test_encoded = le.transform(y_test)

# Rebuild the booster with the best setting found by the randomized search
classifier = xgb.XGBClassifier(random_state=0,
                 colsample_bytree=random_search.best_params_["clf__colsample_bytree"],
                 gamma=random_search.best_params_["clf__gamma"],
                 learning_rate=random_search.best_params_["clf__learning_rate"],
                 max_depth=random_search.best_params_["clf__max_depth"],
                 n_estimators=random_search.best_params_["clf__n_estimators"],
                 subsample=random_search.best_params_["clf__subsample"])
classifier.fit(X_train_vec, y_train_encoded)
y_pred = classifier.predict(X_test_vec)

# Report and keep the test-set metrics (computed on the integer-encoded labels)
accuracy_test, precision_micro_test, recall_micro_test, f1_micro_test, precision_macro_test, recall_macro_test, f1_macro_test = plot_results_topology(y_test_encoded, y_pred)

### KNN

randomized search

In [None]:
# Single-step pipeline, so hyper-parameters are addressed as "clf__<name>"
pipeline = Pipeline(steps=[("clf", KNeighborsClassifier())])

# Candidate neighbourhood sizes and vote weightings
parameter_grid = {
    "clf__n_neighbors": [1, 3, 5, 7],
    "clf__weights": ["uniform", "distance"],
}

k_fold_cv = 5  # Stratified 5-fold cross validation
n_iter = 100   # requested number of sampled settings
random_search = RandomizedSearchCV(
    pipeline,
    parameter_grid,
    n_iter=n_iter,
    scoring=scoring,
    refit=refit,
    cv=k_fold_cv,
    n_jobs=-1,
    random_state=0,
    verbose=1,
)

# Run the search on the vectorized training data and show the winning setting
random_search.fit(X_train_vec, y_train)
random_search.best_params_

fit model

In [None]:
# Rebuild the k-NN model from the best setting found by the search; the
# pipeline prefix "clf__" is stripped so the names match the estimator's arguments
best_knn_params = {
    name.split("__", 1)[1]: value
    for name, value in random_search.best_params_.items()
}
classifier = KNeighborsClassifier(**best_knn_params)
classifier.fit(X_train_vec, y_train)
y_pred = classifier.predict(X_test_vec)

# Report and keep the test-set metrics
(
    accuracy_test,
    precision_micro_test,
    recall_micro_test,
    f1_micro_test,
    precision_macro_test,
    recall_macro_test,
    f1_macro_test,
) = plot_results_topology(y_test, y_pred)