In [4]:
# import libraries
try:
    !pip install texthero
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    from matplotlib.pyplot import figure
    import texthero as hero
    from texthero import preprocessing
 
    from sklearn.preprocessing import LabelEncoder
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import train_test_split
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.pipeline import Pipeline
    from sklearn.ensemble import RandomForestClassifier
    from imblearn.over_sampling import SMOTE
    from sklearn.naive_bayes import MultinomialNB
    from xgboost import XGBClassifier
 
    from sklearn.linear_model import SGDClassifier
    from sklearn.svm import SVC
    from sklearn.ensemble import RandomForestClassifier
 
    from sklearn.feature_selection import chi2, SelectKBest
 
 
    import warnings
except(ImportError):
    print(f'Import Error: {ImportError}')
 
# ignore warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)
 
# set seeds for reproducability
from numpy.random import seed
seed(500)
 
# global configurations
pd.set_option("display.max_colwidth", -1) # show larger text in pandas dataframe

Collecting texthero
  Downloading https://files.pythonhosted.org/packages/1f/5a/a9d33b799fe53011de79d140ad6d86c440a2da1ae8a7b24e851ee2f8bde8/texthero-1.0.9-py3-none-any.whl
Collecting unidecode>=1.1.1
[?25l  Downloading https://files.pythonhosted.org/packages/74/65/91eab655041e9e92f948cb7302e54962035762ce7b518272ed9d6b269e93/Unidecode-1.1.2-py2.py3-none-any.whl (239kB)
[K     |████████████████████████████████| 245kB 7.6MB/s 
Collecting nltk>=3.3
[?25l  Downloading https://files.pythonhosted.org/packages/92/75/ce35194d8e3022203cca0d2f896dbb88689f9b3fce8e9f9cff942913519d/nltk-3.5.zip (1.4MB)
[K     |████████████████████████████████| 1.4MB 11.2MB/s 
Building wheels for collected packages: nltk
  Building wheel for nltk (setup.py) ... [?25l[?25hdone
  Created wheel for nltk: filename=nltk-3.5-cp36-none-any.whl size=1434676 sha256=a579de834dfc2f560462b9921ae14cecd831902ad395de27092bd607a52761f0
  Stored in directory: /root/.cache/pip/wheels/ae/8c/3f/b1fe0ba04555b08b57ab52ab7f86023639a

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.

The module is deprecated in version 0.21 and will be removed in version 0.23 since we've dropped support for Python 2.7. Please rely on the official version of six (https://pypi.org/project/six/).


The sklearn.neighbors.base module is  deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.neighbors. Anything that cannot be imported from sklearn.neighbors is now part of the private API.



In [5]:
# Text Cleaning and Pre-processing
def preprocess_text(features):
    # cleaning steps
    cleaning_pipeline = [
        preprocessing.fillna,
        preprocessing.lowercase,
        preprocessing.remove_whitespace,
        preprocessing.remove_punctuation,
        preprocessing.remove_urls,
        preprocessing.remove_brackets,
        preprocessing.remove_stopwords,
        preprocessing.remove_digits,
        preprocessing.remove_angle_brackets,
        preprocessing.remove_curly_brackets
    ]

    # apply pipeline to text
    clean_text = features.pipe(hero.clean, cleaning_pipeline)
    
    return clean_text

In [8]:
# read csv
df = pd.read_csv('processed_dataset_chi2.csv')
df.columns

target = df[['Category']]

df.drop(['Category'], axis=1, inplace=True)

tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2))

In [9]:

columns = df.columns

total_cols = len(columns)
accuracy_df = pd.DataFrame()

# uncomment out these 2 lines for bulk process
# total_cols = len(df)
# for num_of_cols in range(total_cols):

total_cols = 10 # try 1, 2, 3 ....10
for num_of_cols in range(total_cols-1, total_cols):
    df['description'] = ''
    col_names = ''
    for i in range(num_of_cols+1):
        col_names = col_names + ' ' + columns[i]
        df['description'] = df['description'] + ' ' + df[columns[i]]
    
    clean_text = preprocess_text(df['description']) 

    clean_text = tfidf.fit_transform(clean_text).toarray()

    # split data
    x_train, x_test, y_train, y_test = train_test_split(clean_text, target, random_state=0, test_size=0.25, shuffle=True)

    # balance the data - optional
    oversample = SMOTE(random_state=0,n_jobs=-1,k_neighbors=5)
    x_train, y_train = oversample.fit_resample(x_train, y_train)
    
    # classification - XGBoost
    xgb = XGBClassifier(alpha=0.1)
    xgb.fit(x_train, y_train)
    y_pred = xgb.predict(x_test)
    xgb_accuracy = accuracy_score(y_test, y_pred)

    new_row = {'col_names':col_names, 'xgb':xgb_accuracy}
    accuracy_df = accuracy_df.append(new_row, ignore_index=True)

    print(accuracy_df)

    # save results to csv
    accuracy_df.to_csv('accuracy_ML_chi2.csv', index=False)

KeyboardInterrupt: ignored

In [None]:

columns = df.columns

total_cols = len(columns)
accuracy_df = pd.DataFrame()

# uncomment out these 2 lines for bulk process

for num_of_cols in range(total_cols):

#total_cols = 3 # try 1, 2, 3 ....10
#for num_of_cols in range(total_cols-1, total_cols):
    df['description'] = ''
    col_names = ''
    for i in range(num_of_cols+1):
        col_names = col_names + ' ' + columns[i]
        df['description'] = df['description'] + ' ' + df[columns[i]]
    clean_text = preprocess_text(df['description']) 

    clean_text = tfidf.fit_transform(clean_text).toarray()

    # split data
    x_train, x_test, y_train, y_test = train_test_split(clean_text, target, random_state=0, test_size=0.25, shuffle=True)

    # balance the data - optional
    oversample = SMOTE(random_state=0,n_jobs=-1,k_neighbors=5)
    x_train, y_train = oversample.fit_resample(x_train, y_train)
    
   # classification - XGBoost
    xgb = XGBClassifier(alpha=0.1)
    xgb.fit(x_train, y_train)
    y_pred = xgb.predict(x_test)
    xgb_accuracy = accuracy_score(y_test, y_pred)

    new_row = {'col_names':col_names, 'xgb':xgb_accuracy}
    accuracy_df = accuracy_df.append(new_row, ignore_index=True)


    

    print(accuracy_df)

# save results to csv
accuracy_df.to_csv('accuracy_ML_chi2.csv', index=False)

           col_names       xgb
0   EventDescription  0.745533
                       col_names       xgb
0   EventDescription              0.745533
1   EventDescription ActionTaken  0.778805
                                     col_names       xgb
0   EventDescription                            0.745533
1   EventDescription ActionTaken                0.778805
2   EventDescription ActionTaken IncidentCause  0.817006
                                                         col_names       xgb
0   EventDescription                                                0.745533
1   EventDescription ActionTaken                                    0.778805
2   EventDescription ActionTaken IncidentCause                      0.817006
3   EventDescription ActionTaken IncidentCause IncidentConsequence  0.819470
                                                                        col_names       xgb
0   EventDescription                                                               0.745533
1   EventDes