In [1]:
# import libraries
try:
    import pandas as pd
    import numpy as np
    from collections import OrderedDict
    import matplotlib.pyplot as plt
    from matplotlib.pyplot import figure
    import string
    import texthero as hero
    from texthero import preprocessing
    import nltk
    from sklearn.preprocessing import LabelEncoder
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import train_test_split
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.pipeline import Pipeline
    from sklearn.linear_model import SGDClassifier, LogisticRegression
    from sklearn.feature_selection import chi2, SelectKBest

    from nltk.corpus import wordnet
    from nltk.stem import WordNetLemmatizer

    import warnings
except ImportError as err:
    # Report the dependency that actually failed, then stop. Continuing would
    # leave the names after the failing import undefined and surface later as
    # an unrelated NameError.
    print(f'Import Error: {err}')
    raise

# ignore warnings
# NOTE(review): the blanket "ignore" filter also hides deprecation notices
# from pandas / scikit-learn; the FutureWarning filter below is kept as-is.
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

# set seeds for reproducibility
from numpy.random import seed
seed(500)

# global configurations
# None means "do not truncate". The legacy -1 sentinel was deprecated in
# pandas 1.0 and is rejected by later releases.
pd.set_option("display.max_colwidth", None) # show larger text in pandas dataframe

In [2]:
# read csv
try:
    df = pd.read_csv('../../cleaned_incidents1.csv')
except FileNotFoundError:
    # Only a missing file triggers the fallback location; parse errors and
    # other failures propagate instead of being silently masked.
    # NOTE(review): the fallback is a machine-specific absolute path.
    df = pd.read_csv("/Users/pradeep/Desktop/ProjectANotebooks/notebooks/cleaned_incidents1.csv")

# drop rows whose category is missing
df = df.dropna(axis=0, subset=['Category'])

# factorize category: integer id per distinct Category value
df['category_id'] = df['Category'].factorize()[0]

In [3]:
# List the column labels of the loaded frame (including the derived category_id)
df.columns

Index(['ActionTaken', 'Address', 'AssetLabel', 'CauseCommunity',
       'CauseEnvironment', 'CausePre', 'CauseTechnical', 'CauseWorkP',
       'ContactType', 'CorrectProtection', 'EventDescription', 'FailedAssets',
       'FailedExplosion', 'FailedOilFilled', 'FailedOtherAssets',
       'FailedOtherAssetsOther', 'FeederNumber', 'IncidentCause',
       'IncidentConsequence', 'IncidentDatetime',
       'IncidentFireFFactorReportable', 'IncidentFireSeverity', 'IncidentID',
       'IncidentLocationType', 'IncidentLocationTypeOther', 'IncidentNumber',
       'IncidentType', 'Lat', 'Long', 'MadeSafe', 'NetworkType', 'Status',
       'SubmissionID', 'SubmittedDateTimeString', 'Voltage', 'WeatherStation',
       'Postcode', 'Locality', 'Category', 'category_id'],
      dtype='object')

In [4]:
# Check correlation of the non-string columns with category.
# Only columns with at least a moderate relationship to category are kept.

# get non-object columns
df_non_objects = df.select_dtypes(exclude='object')
print(df_non_objects.dtypes)

# replace NaN with 0 before computing the correlation matrix
df_non_objects = df_non_objects.fillna(0)
# verify that no missing values remain
assert not df_non_objects.isna().any().any()

# check correlation of non-object columns with category (see the category_id column)
df_non_objects.corr()
# result: only IncidentFireFFactorReportable has moderate correlation, the rest are weak, we are going to ignore those columns


CorrectProtection                float64
FailedExplosion                  int64  
FailedOilFilled                  int64  
FailedOtherAssets                int64  
IncidentFireFFactorReportable    float64
IncidentID                       int64  
Lat                              float64
Long                             float64
MadeSafe                         int64  
SubmissionID                     int64  
Postcode                         float64
category_id                      int64  
dtype: object


Unnamed: 0,CorrectProtection,FailedExplosion,FailedOilFilled,FailedOtherAssets,IncidentFireFFactorReportable,IncidentID,Lat,Long,MadeSafe,SubmissionID,Postcode,category_id
CorrectProtection,1.0,-0.342556,-0.337414,-0.341863,-0.148627,-0.416294,-0.121339,0.014758,-0.081096,-0.400821,-0.102839,-0.077354
FailedExplosion,-0.342556,1.0,0.981676,0.990239,-0.060582,0.215122,0.221316,-0.149428,-0.130761,0.186195,-0.119462,-0.095899
FailedOilFilled,-0.337414,0.981676,1.0,0.976934,-0.061994,0.209143,0.216609,-0.14509,-0.1288,0.181922,-0.123442,-0.094374
FailedOtherAssets,-0.341863,0.990239,0.976934,1.0,-0.05969,0.211598,0.222168,-0.149923,-0.129561,0.183428,-0.11824,-0.100103
IncidentFireFFactorReportable,-0.148627,-0.060582,-0.061994,-0.05969,1.0,0.040662,0.145878,-0.004229,0.039426,0.063232,0.211112,0.400701
IncidentID,-0.416294,0.215122,0.209143,0.211598,0.040662,1.0,0.024937,-0.011049,0.086841,0.974835,-0.028798,-0.021569
Lat,-0.121339,0.221316,0.216609,0.222168,0.145878,0.024937,1.0,-0.140321,-0.04483,0.019019,0.173324,0.126489
Long,0.014758,-0.149428,-0.14509,-0.149923,-0.004229,-0.011049,-0.140321,1.0,0.074967,-0.004007,0.077918,-6.8e-05
MadeSafe,-0.081096,-0.130761,-0.1288,-0.129561,0.039426,0.086841,-0.04483,0.074967,1.0,0.099889,0.111762,0.050404
SubmissionID,-0.400821,0.186195,0.181922,0.183428,0.063232,0.974835,0.019019,-0.004007,0.099889,1.0,-0.018241,-0.010711


In [5]:
# Now process columns with object types
df_objects = df.select_dtypes('object')

# Eliminate Address as it can be represented by Locality
df_objects = df_objects.drop(['Address'], axis=1)

# replace missing values in the cause columns with an explicit "unknown" label.
# Each column is filled from itself (CauseWorkP was previously overwritten
# with the contents of CauseTechnical).
cause_defaults = {
    'CauseCommunity': 'Unknown external/community factor',
    'CauseEnvironment': 'Unknown environment factor',
    'CauseTechnical': 'Unknown technical factor',
    'CauseWorkP': 'Unknown work practice factor',
}
df_objects = df_objects.fillna(value=cause_defaults)

# replace the rest with empty string
df_objects = df_objects.fillna('')

# separate target
target = df_objects['Category']
df_objects = df_objects.drop(['Category'], axis=1)
print('object type columns:', df_objects.columns)

object type columns: Index(['ActionTaken', 'AssetLabel', 'CauseCommunity', 'CauseEnvironment',
       'CausePre', 'CauseTechnical', 'CauseWorkP', 'ContactType',
       'EventDescription', 'FailedAssets', 'FailedOtherAssetsOther',
       'FeederNumber', 'IncidentCause', 'IncidentConsequence',
       'IncidentDatetime', 'IncidentFireSeverity', 'IncidentLocationType',
       'IncidentLocationTypeOther', 'IncidentNumber', 'IncidentType',
       'NetworkType', 'Status', 'SubmittedDateTimeString', 'Voltage',
       'WeatherStation', 'Locality'],
      dtype='object')


In [6]:
# Text Cleaning and Pre-processing
def preprocess_text(features):
    """Clean a text column with the notebook's texthero pipeline.

    Args:
        features: pandas Series holding the raw text to clean.

    Returns:
        Whatever ``hero.clean`` produces for ``features`` and the step list
        below.
    """
    # NOTE(review): URL and bracket removal are listed after punctuation
    # removal - confirm that this ordering is intended.
    steps = [
        preprocessing.fillna,
        preprocessing.lowercase,
        preprocessing.remove_whitespace,
        preprocessing.remove_punctuation,
        preprocessing.remove_urls,
        preprocessing.remove_brackets,
        preprocessing.remove_stopwords,
        preprocessing.remove_digits,
        preprocessing.remove_angle_brackets,
        preprocessing.remove_curly_brackets
    ]

    return hero.clean(features, steps)

In [7]:
# Lemmatize with POS Tag
# Module-level WordNetLemmatizer instance; get_lematizer() calls its lemmatize() method.
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(word):
    """Return the WordNet POS constant for a single word.

    The word is tagged on its own with ``nltk.pos_tag`` and the first letter
    of the resulting tag selects adjective, verb or adverb; every other tag
    (nouns included) maps to ``wordnet.NOUN``.
    """
    _token, tag_label = nltk.pos_tag([word])[0]
    initial = tag_label[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    return wordnet.NOUN

def get_lematizer(sentence):
    """Tokenize ``sentence``, lemmatize each token with its POS tag, and
    return the lemmas joined by single spaces.

    Tokens found in ``string.punctuation`` are skipped.
    """
    lemmas = []
    for token in nltk.word_tokenize(sentence):
        if token in string.punctuation:
            continue
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(token)))
    return " ".join(lemmas)


In [8]:
def combine_text_columns(df_source):
    """Concatenate all columns of ``df_source`` into one text Series.

    Args:
        df_source: DataFrame whose columns are joined row by row, in column
            order. Missing values are treated as empty strings.

    Returns:
        A Series aligned with ``df_source.index``. Each column's text is
        preceded by a single space, so a row reads ``' col1 col2 ...'``.
        ``df_source`` itself is not modified.
    """
    description = pd.Series('', index=df_source.index, dtype=object)
    for column in df_source.columns:
        description = description + ' ' + df_source[column].fillna('').astype(str)
    return description

In [9]:
# Sanity check only: lemmatize a sample sentence and inspect the returned string
get_lematizer('light lighting lighten run running work worked working')

'light light lighten run run work work work'

In [None]:
# Build features from columns
# Capture the source columns first so 'description' is never appended to
# itself (that duplicated the whole text) and so the cell can be re-run.
text_columns = [column for column in df_objects.columns if column != 'description']
df_objects['description'] = ''
for column in text_columns:
    df_objects['description'] = df_objects['description'] + ' ' + df_objects[column]

# clean the data
clean_text = preprocess_text(df_objects['description'])
clean_text = clean_text.apply(get_lematizer)

# preparation for feature selection: Category -> category_id lookup
category_id_df = df[['Category', 'category_id']].drop_duplicates().sort_values('category_id')
category_to_id = dict(category_id_df.values)

# vectorize (unigrams and bigrams, terms must occur in at least 5 documents)
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2))

tfidf_features = tfidf.fit_transform(clean_text).toarray()
labels = df.category_id
# kept from the original cell: clean_text is rebound to the dense TF-IDF matrix
clean_text = tfidf_features

print('finish')

In [None]:
# show which terms are related to each category
N = 50
keywords = []

# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when the installed version provides it.
_get_feature_names = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
# The vocabulary does not change between categories, so build the array once.
all_feature_names = np.array(_get_feature_names())

for Category, category_id in sorted(category_to_id.items()):
    # one-vs-rest chi2 test: this category against all others
    features_chi2 = chi2(tfidf_features, labels == category_id)
    indices = np.argsort(features_chi2[0])
    feature_names = all_feature_names[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    # argsort is ascending, so the last N unigrams carry the highest scores
    keywords_perCategory = unigrams[-N:]
    print("# '{}':".format(Category))
    print(keywords_perCategory)
    keywords = keywords + keywords_perCategory

In [None]:
def count_keyWords_inColumn(clean_text_column, keyword_list=None):
    """Count how many words in a text column are keywords.

    Args:
        clean_text_column: pandas Series of whitespace-separated text.
        keyword_list: optional iterable of keywords. When omitted, the
            notebook-level ``keywords`` list is used, as before.

    Returns:
        Total number of word occurrences (repeats included) that appear in
        the keyword collection; 0 for an empty column.
    """
    # A set gives constant-time membership instead of scanning the list per word.
    vocabulary = set(keywords if keyword_list is None else keyword_list)
    return sum(
        1
        for text in clean_text_column
        for word in text.split()
        if word in vocabulary
    )

In [None]:
# Find how many important words in each column
count_df = df_objects.loc[:, df_objects.columns != 'description']
counter_dict = {}
for column in count_df:
    # The keywords come from lemmatized text, so lemmatize each column the
    # same way; otherwise inflected forms never match a keyword.
    clean_text_column = preprocess_text(count_df[column]).apply(get_lematizer)
    counter_dict[column] = count_keyWords_inColumn(clean_text_column)
print(counter_dict)

##Experiments##

In [None]:
# Sort columns by keyword count, highest first; the result is a list of
# (column, count) pairs.
# dict(...) accepts both the original {column: count} mapping and the list of
# pairs this cell produces, so re-running the cell no longer fails.
counter_dict = sorted(dict(counter_dict).items(), key=lambda x: x[1], reverse=True)
print(counter_dict)

In [None]:


# counter_dict may be a {column: count} mapping or a list of (column, count)
# pairs; reverse it so the highest count ends up at the top of the chart.
ranked_counts = list(dict(counter_dict).items())[::-1]
x = [name for name, _count in ranked_counts]
y = [count for _name, count in ranked_counts]

# plot the graph
# style.context keeps the dark theme local to this figure, so it does not
# leak into plots drawn later in the notebook.
with plt.style.context('dark_background'):
    fig, ax = plt.subplots(figsize=(80, 30))
    ax.set_title('Feature Selection: Chi2', fontsize=80, fontweight='bold', color='pink')
    ax.barh(x, y, color='pink')
    ax.set_xlabel('Number of relevant words', fontsize=50, fontweight='bold', color='pink')
    ax.set_ylabel('Features', fontsize=50, fontweight='bold', color='pink')
    ax.tick_params(axis='y', labelsize=28, labelrotation=45)
    ax.tick_params(axis='x', labelsize=28)

    fig.savefig('chi2.png', transparent=True)
    plt.show()

In [None]:
# based on above graph, get top 10 columns
# counter_dict is a sorted list of (column, count) pairs at this point and a
# list has no .keys(); dict() accepts pairs or a mapping and keeps the order.
columns = list(dict(counter_dict))[:10]
print(columns)

In [None]:
# save selected columns
# assign() returns a new frame, so the target is attached without writing
# into a slice of df_objects.
selected_df = df_objects[columns].assign(Category=target)
print(selected_df.columns)
selected_df.to_csv('processed_dataset_chi2.csv', index=False)

In [None]:
# Train both classifiers on the top-1, top-2, ... top-k columns and record
# the test accuracy for each prefix.
total_cols = len(columns)
# Rows are collected in a list and turned into a frame once at the end;
# DataFrame.append was removed in pandas 2.0.
accuracy_records = []
# Running concatenation of the columns used so far (same text the original
# rebuilt from scratch on every iteration).
description = pd.Series('', index=selected_df.index)
col_names = ''
for num_of_cols in range(total_cols):
    column = columns[num_of_cols]
    col_names = col_names + ' ' + column
    description = description + ' ' + selected_df[column]

    # clean the data; lemmatize as well because the vectorizer was fitted on
    # lemmatized text
    clean_text = preprocess_text(description).apply(get_lematizer)

    clean_text = tfidf.transform(clean_text).toarray()

    # split data
    x_train, x_test, y_train, y_test = train_test_split(clean_text, target, random_state=0, test_size=0.25, shuffle=True)

    #classification - Logistic regression
    lr = LogisticRegression(n_jobs=1, C=1e5)
    lr.fit(x_train ,y_train)
    y_pred = lr.predict(x_test)
    lr_accuracy = accuracy_score(y_test, y_pred)

    #classification - SGD
    sgd = SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)
    sgd.fit(x_train, y_train)
    y_pred = sgd.predict(x_test)
    sgd_accuracy = accuracy_score(y_test, y_pred)

    accuracy_records.append({'col_names':col_names, 'lr':lr_accuracy, 'sgd':sgd_accuracy})

accuracy_df = pd.DataFrame(accuracy_records, columns=['col_names', 'lr', 'sgd'])
print(accuracy_df)

In [None]:
# Write the accuracy table to accuracy_chi2.csv, without the row index
accuracy_df.to_csv('accuracy_chi2.csv', index=False)

In [None]:
# Visualise accuracy

# Reload from disk in case accuracy_df is no longer in the kernel. This is
# the file written by the save cell of this notebook.
accuracy_df = pd.read_csv('accuracy_chi2.csv')

# x axis: number of features used for each row (1-based). The saved table
# has no 'col_id' column, so fall back to the row position when it is absent.
if 'col_id' in accuracy_df.columns:
    x = accuracy_df['col_id'] + 1
else:
    x = pd.Series(range(1, len(accuracy_df) + 1))

y1 = accuracy_df['lr']
y2 = accuracy_df['sgd']

# Draw with the default style so a theme set by an earlier cell (e.g.
# 'dark_background') cannot make the labels unreadable on the white figure.
with plt.style.context('default'):
    figure(num=None, figsize=(10, 6), dpi=80, facecolor='w', edgecolor='k')

    plt.plot(x, y1, label = "Logistic Regression")
    plt.plot(x, y2, label = "Stochastic Gradient Descent")

    plt.xlabel('Number of features')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.xticks(x)

    plt.show()

