<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [130]:
import sys
import numpy as np
import os
import pandas as pd
from tqdm.auto import tqdm
sys.path.append('../')
from src.clean_data import normalize_text
from sklearn.utils import class_weight
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [131]:
working_dir = os.getcwd()
data_path = os.path.dirname(working_dir) + '/data/'
df = pd.read_csv(data_path + 'preprocessed.csv')

In [132]:
df.head()

Unnamed: 0,complaint_description,assigned_division
0,Date last observed: 29-jun-20; time last ob...,Housing Inspection Services
1,Unpermitted interior framing at 3rd level atti...,Electrical Inspection Division
2,Elevator (in the parking area )to condos is in...,Housing Inspection Services
3,Complainant is concerned about the lenght of t...,Building Inspection Division
4,Date last observed: 06-jan-21; time last ob...,Building Inspection Division


In [133]:
#ensure there are no null values or duplicates
print(f"Null: \n {df.isna().sum()} \n")
print(f"Duplicates:  {df.duplicated().sum()}")

Null: 
 complaint_description    0
assigned_division        0
dtype: int64 

Duplicates:  0


In [134]:
df.shape

(183607, 2)

In [135]:
df.assigned_division.value_counts()

Housing Inspection Services       86793
Building Inspection Division      63267
Plumbing Inspection Division      16241
Code Enforcement Section          10680
Electrical Inspection Division     5515
Disabled Access Division           1111
Name: assigned_division, dtype: int64

Now that we have loaded our preprocessed data we're ready to prepare it for machine learning using the normalize_text function declared in our clean_data module. 

We will prepare a stemmed and lemmatized version of the data so that we can explore the change in performance from both approaches. Lemmatization takes significantly longer, but can yield improved accuracy. 

We are using tqdm.pandas to keep track of our functions with a progress bar. 

In [136]:
#create stemmed text
tqdm.pandas()
stemmed = df.copy()
stemmed['complaint_description'] = stemmed['complaint_description'].progress_apply(lambda x: normalize_text(x))
stemmed.to_csv(data_path + '/stemmed_text.csv')

  0%|          | 0/183607 [00:00<?, ?it/s]

In [137]:
#create lemmatized text
tqdm.pandas()
lemmatized = df.copy()
lemmatized['complaint_description'] = lemmatized['complaint_description'].progress_apply(lambda x: normalize_text(x, lemmatize = True))
lemmatized.to_csv(data_path + '/lemmatized_text.csv')

  0%|          | 0/183607 [00:00<?, ?it/s]

In [None]:
# #define tf tokenizer
# tokenizer = Tokenizer(num_words=5000, oov_token='<UNK>')

# #fit tokenizer to text
# tokenizer.fit_on_texts(lemmatized.complaint_description)

# #define variables for word count and index
# word_count = tokenizer.word_counts
# word_index = tokenizer.word_index

In [138]:
lemm = pd.read_csv(data_path + 'lemmatized_text.csv',index_col=[0])

In [139]:
lemm.head()

Unnamed: 0,complaint_description,assigned_division
0,"['date', 'last', 'observe', 'jun', 'time', 'la...",Housing Inspection Services
1,"['unpermitte', 'interior', 'frame', 'rd', 'lev...",Electrical Inspection Division
2,"['elevator', 'parking', 'area', 'condo', 'inop...",Housing Inspection Services
3,"['complainant', 'concerned', 'lenght', 'time',...",Building Inspection Division
4,"['date', 'last', 'observe', 'jan', 'time', 'la...",Building Inspection Division


In [140]:
print(lemm.isna().sum())
print("Duplicated :", lemm.duplicated().sum())

complaint_description    0
assigned_division        0
dtype: int64
Duplicated : 15284


In [51]:
duplicated_lemm = lemm[lemm.duplicated() == True]

In [141]:
lemm.assigned_division.value_counts()

Housing Inspection Services       86793
Building Inspection Division      63267
Plumbing Inspection Division      16241
Code Enforcement Section          10680
Electrical Inspection Division     5515
Disabled Access Division           1111
Name: assigned_division, dtype: int64

In [52]:
duplicated_lemm.assigned_division.value_counts()

Plumbing Inspection Division      5610
Housing Inspection Services       5295
Building Inspection Division      2312
Code Enforcement Section          1943
Electrical Inspection Division      98
Disabled Access Division            26
Help Desk / Technical Services       4
Name: assigned_division, dtype: int64

In [142]:
y_train = lemm.assigned_division
class_weights = class_weight.compute_class_weight('balanced',
                                                 np.unique(y_train),
                                                 y_train)
# class_weight_dict = dict(enumerate(class_weight,1))|

 'Disabled Access Division' 'Electrical Inspection Division'
 'Housing Inspection Services' 'Plumbing Inspection Division'], y=0            Housing Inspection Services
1         Electrical Inspection Division
2            Housing Inspection Services
3           Building Inspection Division
4           Building Inspection Division
                       ...              
183602      Plumbing Inspection Division
183603       Housing Inspection Services
183604      Building Inspection Division
183605       Housing Inspection Services
183606      Plumbing Inspection Division
Name: assigned_division, Length: 183607, dtype: object as keyword args. From version 1.0 (renaming of 0.25) passing these as positional arguments will result in an error


In [143]:
print(class_weights)

[ 0.48368291  2.86527778 27.54380438  5.54871562  0.35257644  1.88419227]
