In [7]:
# import libraries
try:
    !pip install texthero
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    from matplotlib.pyplot import figure
    import texthero as hero
    from texthero import preprocessing

    from sklearn.preprocessing import LabelEncoder
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import train_test_split
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.pipeline import Pipeline
    from sklearn.ensemble import RandomForestClassifier
    from imblearn.over_sampling import SMOTE
    from sklearn.naive_bayes import MultinomialNB

    from sklearn.linear_model import SGDClassifier
    from sklearn.svm import SVC
    from sklearn.ensemble import RandomForestClassifier

    from sklearn.feature_selection import chi2, SelectKBest


    import warnings
except(ImportError):
    print(f'Import Error: {ImportError}')

# ignore warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

# set seeds for reproducability
from numpy.random import seed
seed(500)

# global configurations
pd.set_option("display.max_colwidth", -1) # show larger text in pandas dataframe



In [9]:
# Text Cleaning and Pre-processing
def preprocess_text(features):
    # cleaning steps
    cleaning_pipeline = [
        preprocessing.fillna,
        preprocessing.lowercase,
        preprocessing.remove_whitespace,
        preprocessing.remove_punctuation,
        preprocessing.remove_urls,
        preprocessing.remove_brackets,
        preprocessing.remove_stopwords,
        preprocessing.remove_digits,
        preprocessing.remove_angle_brackets,
        preprocessing.remove_curly_brackets
    ]

    # apply pipeline to text
    clean_text = features.pipe(hero.clean, cleaning_pipeline)
    
    return clean_text

In [10]:
# read csv
df = pd.read_csv('/processed_dataset_chi2.csv')
df.columns

target = df[['Category']]

df.drop(['Category'], axis=1, inplace=True)

tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2))

In [13]:
# SGD, SVM, RF
columns = df.columns

total_cols = len(columns)
accuracy_df = pd.DataFrame()

# uncomment out these 2 lines for bulk process
# total_cols = len(df)
# for num_of_cols in range(total_cols):

total_cols = 10 # try 1, 2, 3 ....10
for num_of_cols in range(total_cols-1, total_cols):
    df['description'] = ''
    col_names = ''
    for i in range(num_of_cols+1):
        col_names = col_names + ' ' + columns[i]
        df['description'] = df['description'] + ' ' + df[columns[i]]
    
    clean_text = preprocess_text(df['description']) 

    clean_text = tfidf.fit_transform(clean_text).toarray()

    # split data
    x_train, x_test, y_train, y_test = train_test_split(clean_text, target, random_state=0, test_size=0.25, shuffle=True)

    # balance the data - optional
    oversample = SMOTE(random_state=0,n_jobs=-1,k_neighbors=5)
    x_train, y_train = oversample.fit_resample(x_train, y_train)
    
    # classification - SGD
    nb = MultinomialNB (alpha=0.1)
    nb.fit(x_train, y_train)
    y_pred = nb.predict(x_test)
    nb_accuracy = accuracy_score(y_test, y_pred)

    new_row = {'col_names':col_names, 'nb':nb_accuracy}
    accuracy_df = accuracy_df.append(new_row, ignore_index=True)

    print(accuracy_df)

    # save results to csv
    accuracy_df.to_csv('accuracy_ML_chi2.csv', index=False)

                                                                                                                                                     col_names        nb
0   EventDescription ActionTaken IncidentCause IncidentConsequence CauseCommunity FailedAssets CauseWorkP CauseTechnical CauseEnvironment IncidentFireSeverity  0.338879


In [14]:
# SGD, SVM, RF
columns = df.columns

total_cols = len(columns)
accuracy_df = pd.DataFrame()

# uncomment out these 2 lines for bulk process

for num_of_cols in range(total_cols):

#total_cols = 3 # try 1, 2, 3 ....10
#for num_of_cols in range(total_cols-1, total_cols):
    df['description'] = ''
    col_names = ''
    for i in range(num_of_cols+1):
        col_names = col_names + ' ' + columns[i]
        df['description'] = df['description'] + ' ' + df[columns[i]]
    clean_text = preprocess_text(df['description']) 

    clean_text = tfidf.fit_transform(clean_text).toarray()

    # split data
    x_train, x_test, y_train, y_test = train_test_split(clean_text, target, random_state=0, test_size=0.25, shuffle=True)

    # balance the data - optional
    oversample = SMOTE(random_state=0,n_jobs=-1,k_neighbors=5)
    x_train, y_train = oversample.fit_resample(x_train, y_train)
    
   # classification - SGD
    nb = MultinomialNB (alpha=0.1)
    nb.fit(x_train, y_train)
    y_pred = nb.predict(x_test)
    nb_accuracy = accuracy_score(y_test, y_pred)

    new_row = {'col_names':col_names, 'nb':nb_accuracy}
    accuracy_df = accuracy_df.append(new_row, ignore_index=True)


    

    print(accuracy_df)

# save results to csv
accuracy_df.to_csv('accuracy_ML_chi2.csv', index=False)

           col_names        nb
0   EventDescription  0.697474
                       col_names        nb
0   EventDescription              0.697474
1   EventDescription ActionTaken  0.712261
                                     col_names        nb
0   EventDescription                            0.697474
1   EventDescription ActionTaken                0.712261
2   EventDescription ActionTaken IncidentCause  0.753543
                                                         col_names        nb
0   EventDescription                                                0.697474
1   EventDescription ActionTaken                                    0.712261
2   EventDescription ActionTaken IncidentCause                      0.753543
3   EventDescription ActionTaken IncidentCause IncidentConsequence  0.751694
                                                                        col_names        nb
0   EventDescription                                                               0.697474
1   EventDes

The set of 9 features is giving the highest accuracy of 77 and least is with 11 feature i.e, 33.