# Yu-Ting Shen

# RiskGenius Challenge Project

In [147]:
import pandas as pd

df_insurance_terms = pd.read_csv('terms.csv')
df_insurance_terms = df_insurance_terms[['term', 'text']]

In [148]:
df_insurance_terms.head()

Unnamed: 0,term,text
0,automatic premium loan,An optional provision in life insurance that a...
1,Household Goods Transportation Act of 1980,Provided a nonjudicial dispute settlement prog...
2,hydrocarbons,A class of organic compounds composed only of ...
3,hydraulic fracturing (fracking),A process in which fractures in hard-to-reach ...
4,hybrid plans,Risk financing techniques that are a combinati...


In [149]:
df_insurance_terms.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3261 entries, 0 to 3260
Data columns (total 2 columns):
term    3261 non-null object
text    3261 non-null object
dtypes: object(2)
memory usage: 51.0+ KB


Define a function to lower case, tokenize words, remove stop words and punctuations

In [150]:
from nltk import word_tokenize
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

def normalize_string(string):
    # Set to lower case
    string = string.lower()
    
    # Remove hyphen
    # Some words contain a hyphen
    string = string.replace('-', ' ')
    
    # Tokenization
    tokenized_words = word_tokenize(string)
    
    # Remove stopwords
    stopwords_removed = [word for word in tokenized_words if word not in stop_words]
    
    # Remove punctuations
#     alphabet_string = [word for word in stopwords_removed if word.isalpha()]
    alphabet_string = [word for word in stopwords_removed if word.isalnum()]
    
    # Return joined strings
    return ' '.join(alphabet_string)

In [151]:
df_insurance_terms.loc[:, 'new_term'] = df_insurance_terms.apply(lambda row: normalize_string(row['term']), axis=1)
df_insurance_terms.loc[:, 'new_text'] = df_insurance_terms.apply(lambda row: normalize_string(row['text']), axis=1)

In [152]:
df_insurance_terms.head()

Unnamed: 0,term,text,new_term,new_text
0,automatic premium loan,An optional provision in life insurance that a...,automatic premium loan,optional provision life insurance authorizes i...
1,Household Goods Transportation Act of 1980,Provided a nonjudicial dispute settlement prog...,household goods transportation act 1980,provided nonjudicial dispute settlement progra...
2,hydrocarbons,A class of organic compounds composed only of ...,hydrocarbons,class organic compounds composed carbon hydrog...
3,hydraulic fracturing (fracking),A process in which fractures in hard-to-reach ...,hydraulic fracturing fracking,process fractures hard reach shale rock format...
4,hybrid plans,Risk financing techniques that are a combinati...,hybrid plans,risk financing techniques combination retentio...


In [153]:
df_insurance_terms.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3261 entries, 0 to 3260
Data columns (total 4 columns):
term        3261 non-null object
text        3261 non-null object
new_term    3261 non-null object
new_text    3261 non-null object
dtypes: object(4)
memory usage: 102.0+ KB


Categorize group into 26 groups by the first character of term

In [154]:
df_insurance_terms['alphabet_group'] = df_insurance_terms['new_term'].str[0]

In [155]:
df_insurance_terms.head()

Unnamed: 0,term,text,new_term,new_text,alphabet_group
0,automatic premium loan,An optional provision in life insurance that a...,automatic premium loan,optional provision life insurance authorizes i...,a
1,Household Goods Transportation Act of 1980,Provided a nonjudicial dispute settlement prog...,household goods transportation act 1980,provided nonjudicial dispute settlement progra...,h
2,hydrocarbons,A class of organic compounds composed only of ...,hydrocarbons,class organic compounds composed carbon hydrog...,h
3,hydraulic fracturing (fracking),A process in which fractures in hard-to-reach ...,hydraulic fracturing fracking,process fractures hard reach shale rock format...,h
4,hybrid plans,Risk financing techniques that are a combinati...,hybrid plans,risk financing techniques combination retentio...,h


In [156]:
df_insurance_terms.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3261 entries, 0 to 3260
Data columns (total 5 columns):
term              3261 non-null object
text              3261 non-null object
new_term          3261 non-null object
new_text          3261 non-null object
alphabet_group    3261 non-null object
dtypes: object(5)
memory usage: 127.5+ KB


In [157]:
abc = [i for i in 'abcdefghijklmnopqrstuvwxyz']
num = [i for i in range(1, 27)]
abc_to_num = dict(zip(abc, num))
# print(abc_to_num)

df_insurance_terms['group'] = df_insurance_terms['alphabet_group'].map(abc_to_num)

In [158]:
df_insurance_terms.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3261 entries, 0 to 3260
Data columns (total 6 columns):
term              3261 non-null object
text              3261 non-null object
new_term          3261 non-null object
new_text          3261 non-null object
alphabet_group    3261 non-null object
group             3257 non-null float64
dtypes: float64(1), object(5)
memory usage: 152.9+ KB


In [159]:
df_insurance_terms[df_insurance_terms['group'].isnull()]

Unnamed: 0,term,text,new_term,new_text,alphabet_group,group
801,401(k) plan,The most common type of defined contribution r...,401 k plan,common type defined contribution retirement pl...,4,
802,401(k) fee claims,Claims alleging that the individuals responsib...,401 k fee claims,claims alleging individuals responsible admini...,4,
1873,831(b) captive,A captive that is taxed under Internal Revenue...,831 b captive,captive taxed internal revenue code 831 b prov...,8,
2815,10 10 Rule,The issue of analyzing and demonstrating risk ...,10 10 rule,issue analyzing demonstrating risk transfer pr...,1,


In [160]:
row_index = df_insurance_terms[df_insurance_terms['group'].isnull()].index

In [161]:
df_insurance_terms.loc[row_index, 'group'] = 27

In [162]:
df_insurance_terms.iloc[row_index]

Unnamed: 0,term,text,new_term,new_text,alphabet_group,group
801,401(k) plan,The most common type of defined contribution r...,401 k plan,common type defined contribution retirement pl...,4,27.0
802,401(k) fee claims,Claims alleging that the individuals responsib...,401 k fee claims,claims alleging individuals responsible admini...,4,27.0
1873,831(b) captive,A captive that is taxed under Internal Revenue...,831 b captive,captive taxed internal revenue code 831 b prov...,8,27.0
2815,10 10 Rule,The issue of analyzing and demonstrating risk ...,10 10 rule,issue analyzing demonstrating risk transfer pr...,1,27.0


In [164]:
df_insurance_terms.loc['group'] = df_insurance_terms['group'].astype(int)

Unnamed: 0,term,text,new_term,new_text,alphabet_group,group
0,automatic premium loan,An optional provision in life insurance that a...,automatic premium loan,optional provision life insurance authorizes i...,a,1.0
1,Household Goods Transportation Act of 1980,Provided a nonjudicial dispute settlement prog...,household goods transportation act 1980,provided nonjudicial dispute settlement progra...,h,8.0
2,hydrocarbons,A class of organic compounds composed only of ...,hydrocarbons,class organic compounds composed carbon hydrog...,h,8.0
3,hydraulic fracturing (fracking),A process in which fractures in hard-to-reach ...,hydraulic fracturing fracking,process fractures hard reach shale rock format...,h,8.0
4,hybrid plans,Risk financing techniques that are a combinati...,hybrid plans,risk financing techniques combination retentio...,h,8.0
