In [4]:
# import libraries
try:
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    from matplotlib.pyplot import figure
    import string
    import texthero as hero
    from texthero import preprocessing
    import nltk
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer
    from sklearn.preprocessing import LabelEncoder
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import train_test_split
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.pipeline import Pipeline
    from imblearn.over_sampling import SMOTE

    from sklearn.neural_network import MLPClassifier
    from sklearn.feature_selection import chi2
    from nltk.corpus import wordnet
    from nltk.stem import WordNetLemmatizer

    import warnings
except(ImportError):
    print(f'Import Error: {ImportError}')

# ignore warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

# set seeds for reproducability
from numpy.random import seed
seed(500)

# global configurations
pd.set_option("display.max_colwidth", -1) # show larger text in pandas dataframe

In [5]:
# Text Cleaning and Pre-processing
def preprocess_text(features):
    # cleaning steps
    cleaning_pipeline = [
        preprocessing.fillna,
        preprocessing.lowercase,
        preprocessing.remove_whitespace,
        preprocessing.remove_punctuation,
        preprocessing.remove_urls,
        preprocessing.remove_brackets,
        preprocessing.remove_stopwords,
        preprocessing.remove_digits,
        preprocessing.remove_angle_brackets,
        preprocessing.remove_curly_brackets
    ]

    # apply pipeline to text
    clean_text = features.pipe(hero.clean, cleaning_pipeline)
    
    return clean_text

##Experiements##

In [8]:
# MLP

# read csv
df = pd.read_csv('processed_dataset_chi2.csv')
df.columns

target = df[['Category']]

df.drop(['Category'], axis=1, inplace=True)

columns = df.columns

total_cols = len(columns)
accuracy_df = pd.DataFrame()

tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2))

# uncomment out these 2 lines for bulk process
# total_cols = len(df)
# for num_of_cols in range(total_cols):

total_cols = 10 # try 1, 2, 3 ....10
for num_of_cols in range(total_cols-1):
    df['description'] = ''
    col_names = ''
    for i in range(num_of_cols+1):
        col_names = col_names + ' ' + columns[i]
        df['description'] = df['description'] + ' ' + df[columns[i]]
    
    clean_text = preprocess_text(df['description']) 

    clean_text = tfidf.fit_transform(clean_text).toarray()

    # split data
    x_train, x_test, y_train, y_test = train_test_split(clean_text, target, random_state=0, test_size=0.25, shuffle=True)

    # balance the data - optional
    #oversample = SMOTE(random_state=0,n_jobs=-1,k_neighbors=5)
    #x_train, y_train = oversample.fit_resample(x_train, y_train)
    
    # classification - MLP
    mlp = MLPClassifier(random_state=0
                        )
    mlp.fit(x_train, y_train)
    y_pred = mlp.predict(x_test)
    mlp_accuracy = accuracy_score(y_test, y_pred)

    new_row = {'col_names':col_names, 'mlp':mlp_accuracy}

    print (f'{new_row["col_names"]} : {new_row["mlp"]}')

 EventDescription : 0.7467652495378928
 EventDescription ActionTaken : 0.7806531115218731
 EventDescription ActionTaken IncidentCause : 0.8250154035736291
 EventDescription ActionTaken IncidentCause IncidentConsequence : 0.8311768330252619
 EventDescription ActionTaken IncidentCause IncidentConsequence CauseCommunity : 0.8299445471349353
 EventDescription ActionTaken IncidentCause IncidentConsequence CauseCommunity FailedAssets : 0.8354898336414048
 EventDescription ActionTaken IncidentCause IncidentConsequence CauseCommunity FailedAssets CauseWorkP : 0.8404189772027111
 EventDescription ActionTaken IncidentCause IncidentConsequence CauseCommunity FailedAssets CauseWorkP CauseTechnical : 0.8404189772027111
 EventDescription ActionTaken IncidentCause IncidentConsequence CauseCommunity FailedAssets CauseWorkP CauseTechnical CauseEnvironment : 0.8373382624768947
