In [1]:
# arch -x86_64 brew install <package>

In [1]:
import pandas as pd
from duration import *

In [2]:
# Load the raw DAP history export, resolved relative to the notebook's working
# directory (no user-specific absolute path).
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk.
# NOTE(review): the saved output of this cell carries the residue of a pandas
# warning and ageinc/agereg/agefirst load as `object`; presumably this was a
# mixed-type DtypeWarning -- confirm on the next clean run.
df = pd.read_csv('dap_history.csv', low_memory=False)
print(df.shape)
print(df.dtypes)
df.head()

(786741, 9)
ednum            int64
happening       object
ageinc          object
siteid          object
abstn6mths      object
agereg          object
abstnlongest    object
agefirst        object
backgroundid     int64
dtype: object


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,ednum,happening,ageinc,siteid,abstn6mths,agereg,abstnlongest,agefirst,backgroundid
0,457814,Post partum contributed to depression and incr...,33.0,ArapahoeHouse,,33.0,,15,14264
1,732698,,8.0,ArapahoeHouse,6 months,25.0,8 years,17,30582
2,1319897,NOTHING DIFFERENT,21.0,ArapahoeHouse,1 month,18.0,3 months,12,36136
3,311139,NOTHING,21.0,ArapahoeHouse,3 months,21.0,3 months,16,29947
4,988020,,,ArapahoeHouse,,,,18,28172


### abstn6mths

50 unique values cover 96.1%

Variations dealt with

- '6 Days'
- '3 1/2 - 4 months'

Variations to deal with

- 'since 4/26/2009'

Other things to consider

- How to speed up processing 
- How to deal with values that have no unit (e.g. '128', which is currently returned as 'unidentified')

Assumptions
- Any "1/2" is treated as 0.5, even when it is fused to the preceding digit: "11/2" is read as "1 1/2" (= 1.5), not as eleven halves
- Intervals are converted to the average of their two endpoints (e.g. '2 to 4 days' becomes 3 days)

In [3]:
# Smoke-test transform_duration on hand-picked inputs, grouped by how many
# time units each string mentions. Each group is (banner, sample strings).
case_groups = [
    # Inputs without any unit
    ('\n(1) Cases with no unit\n=======', ['NaN', '128']),
    # Inputs with exactly one unit
    ('\n(2) Cases with a unit\n=======', [
        '2 days', '2 to 4 days', '11/2 day', '3 1/2 - 4 months', '6-months',
        ', 4 days', '2 or 3 weeks', 'A day or two', 'a day.',
        '1 and 1/2 day', 'one and a half day', 'one and a half thousand days',
        # numbers written partly as digits and partly as words are not handled
        '1 and a half day',
        'a day an half',
    ]),
    # Inputs that mix two different units
    ('\n(3) Cases with more than one unit\n=======',
     ['1 day to 2 weeks', '1 year 6 months', '5 months,3week']),
    # Phrasings the parser does not cover
    ('\n(4) Cases not accounted for\n=======',
     ['a few weeks', 'last 12 hours', '1weekend']),
]

for banner, samples in case_groups:
    print(banner)
    for sample in samples:
        print(f'{sample:20s}', transform_duration(sample))


(1) Cases with no unit
NaN                  (0.0, 'unidentified')
128                  (0.0, 'unidentified')

(2) Cases with a unit
2 days               (2.0, 'day')
2 to 4 days          (3.0, 'day')
11/2 day             (1.5, 'day')
3 1/2 - 4 months     (3.75, 'month')
, 4 days             (4.0, 'day')
2 or 3 weeks         (2.5, 'week')
A day or two         (1.5, 'day')
a day.               (1.0, 'day')
1 and 1/2 day        (1.5, 'day')
one and a half day   (1.5, 'day')
one and a half thousand days (1500.0, 'day')
1 and a half day     (0.0, 'unidentified')
6-months             (6.0, 'month')
a day an half        (0.0, 'unidentified')

(3) Cases with more than one unit
1 day to 2 weeks     (7.5, 'day')
1 year 6 months      (545.0, 'day')
5 months,3week       (171.0, 'day')

(4) Cases not accounted for
a few weeks          (0.0, 'unidentified')
last 12 hours        (0.0, 'unidentified')
1weekend             (0.0, 'unidentified')


In [None]:
# convert number text to int
# do not split into individual words, cannot take into account strings with a mixture of int and str
# one and a half thousand --> 'one', 'a half thousand' --> 'one', 'half thousand' --> 'one', 'half', 'thousand'
    # --> 1, 0.5, 0 [DONE]

# mixture of int and str 
# 1 to two --> '1', 'two' [CANNOT SOLVE]


# comma 
# '1, 2 weeks' --> 1.5 weeks (average -- 1 unit) [DONE]
# '1 month, 1 week' --> 37 days (sum -- 2 units) [DONE]


# 3,4 weeks --> 3.5 w
# 5 months,3week --> 23 w

# regex (at the top) 6-months --> 6 months

In [None]:
# v1) Remove commas and dots at the start and end of sentence
# v2) Add "or" for averaging
# v3) For cases with 2 units, check if regex "^[0-9]* (unit) [0-9]* (unit)$" --> split to 2 chunks
# v4) Add ',' and treat it as  "and" to Case (3)

In [4]:
# One row per distinct raw `abstn6mths` value, so each value is parsed only once.
distinct_abstn6mths = df['abstn6mths'].unique()
df_abstn6mths = pd.DataFrame({'abstn6mths': distinct_abstn6mths})
df_abstn6mths.head()

Unnamed: 0,abstn6mths
0,
1,6 months
2,1 month
3,3 months
4,1 Week


In [9]:
# Parse every distinct raw value (stringified and lower-cased) with
# transform_duration; the result is stored as-is in `transformed`.
df_abstn6mths['transformed'] = df_abstn6mths['abstn6mths'].map(
    lambda raw: transform_duration(str(raw).lower())
)

# Spread the parsed pairs over a numeric column and a unit column.
parsed_pairs = pd.DataFrame(
    df_abstn6mths['transformed'].tolist(),
    index=df_abstn6mths.index,
)
df_abstn6mths[['abstn6mths_N', 'abstn6mths_unit']] = parsed_pairs

# How many distinct values landed in each unit, then the unparsed ones
# from position 100 onwards.
unit_is_unknown = df_abstn6mths['abstn6mths_unit'] == 'unidentified'
display(df_abstn6mths['abstn6mths_unit'].value_counts())
display(df_abstn6mths[unit_is_unknown].iloc[100:])

unidentified    111
day              63
month            50
week             34
year              6
hour              2
Name: abstn6mths_unit, dtype: int64

Unnamed: 0,abstn6mths,transformed,abstn6mths_N,abstn6mths_unit
230,since 4/26/2009,"(0.0, unidentified)",0.0,unidentified
234,a couple weeks,"(0.0, unidentified)",0.0,unidentified
245,since arrest,"(0.0, unidentified)",0.0,unidentified
247,at least 30 day,"(0.0, unidentified)",0.0,unidentified
255,couple days,"(0.0, unidentified)",0.0,unidentified
258,Couple months,"(0.0, unidentified)",0.0,unidentified
259,2 months (court,"(0.0, unidentified)",0.0,unidentified
262,(including alco,"(0.0, unidentified)",0.0,unidentified
263,35 days while i,"(0.0, unidentified)",0.0,unidentified
264,(while in jail,"(0.0, unidentified)",0.0,unidentified


In [9]:
# Rows of the full table whose raw `abstn6mths` text contains "93";
# na=False treats missing values as non-matches.
contains_93 = df['abstn6mths'].str.contains("93", na=False)
df[contains_93]

Unnamed: 0,ednum,happening,ageinc,siteid,abstn6mths,agereg,abstnlongest,agefirst,backgroundid
9371,1235165,I got the hell beat out of me by my fiance. I...,22,ArapahoeHouse,93 days,14,93 days,14,7656
9486,302060,I got the hell beat out of me by my fiance. I...,22,ArapahoeHouse,93 days,14,93 days,14,7656
9577,741431,I got the hell beat out of me by my fiance. I...,22,ArapahoeHouse,93 days,14,93 days,14,7656
9644,785316,I got the hell beat out of me by my fiance. I...,22,ArapahoeHouse,93 days,14,93 days,14,7656
9729,356465,I got the hell beat out of me by my fiance. I...,22,ArapahoeHouse,93 days,14,93 days,14,7656
...,...,...,...,...,...,...,...,...,...
28401,1288419,I got the hell beat out of me by my fiance. I...,22,ArapahoeHouse,93 days,14,93 days,14,7656
28680,323608,I got the hell beat out of me by my fiance. I...,22,ArapahoeHouse,93 days,14,93 days,14,7656
28715,393159,I got the hell beat out of me by my fiance. I...,22,ArapahoeHouse,93 days,14,93 days,14,7656
29180,244119,I got the hell beat out of me by my fiance. I...,22,ArapahoeHouse,93 days,14,93 days,14,7656


#### Units

### abstnlongest


70 unique values cover 95.7%

Length of abstinence period

In [11]:
# One row per distinct raw `abstnlongest` value, so each value is parsed only once.
df_abstnlongest = pd.DataFrame(data=df['abstnlongest'].unique(), columns=['abstnlongest'])

# Parse every distinct raw value (stringified and lower-cased) with
# transform_duration. Series.map visits the single column directly instead of
# building a row object per record.
df_abstnlongest['transformed'] = df_abstnlongest['abstnlongest'].map(
    lambda raw: transform_duration(str(raw).lower())
)

# Spread the parsed pairs over a numeric column and a unit column.
df_abstnlongest[['abstnlongest_N', 'abstnlongest_unit']] = pd.DataFrame(
    df_abstnlongest['transformed'].tolist(),
    index=df_abstnlongest.index,
)

# Number of distinct raw values per detected unit.
display(df_abstnlongest['abstnlongest_unit'].value_counts())



unidentified    146
year             92
month            63
day              44
week             25
hour              1
Name: abstnlongest_unit, dtype: int64

In [16]:
# Distinct `abstnlongest` values the parser could not assign a unit to,
# from position 100 onwards.
unit_is_unknown = df_abstnlongest['abstnlongest_unit'] == 'unidentified'
display(df_abstnlongest.loc[unit_is_unknown].iloc[100:])


Unnamed: 0,abstnlongest,transformed,abstnlongest_N,abstnlongest_unit
275,8 day hospitali,"(0.0, unidentified)",0.0,unidentified
276,28 days current,"(0.0, unidentified)",0.0,unidentified
278,Over one year,"(0.0, unidentified)",0.0,unidentified
280,Unknown - pt po,"(0.0, unidentified)",0.0,unidentified
281,64days in trea,"(0.0, unidentified)",0.0,unidentified
282,Not known,"(0.0, unidentified)",0.0,unidentified
283,now,"(0.0, unidentified)",0.0,unidentified
284,less than aweek,"(0.0, unidentified)",0.0,unidentified
287,5 yrs.,"(0.0, unidentified)",0.0,unidentified
288,one or two days,"(0.0, unidentified)",0.0,unidentified


### happening

2000 unique values cover 96.8%

In [14]:
# One row per distinct free-text `happening` answer.
distinct_happening = df['happening'].unique()
df_happening = pd.DataFrame({'happening': distinct_happening})
df_happening.head()

Unnamed: 0,happening
0,Post partum contributed to depression and incr...
1,
2,NOTHING DIFFERENT
3,NOTHING
4,NO LONGER HAD OVER THE ROAD TRUCK DRIVING JOB ...


In [15]:
import stressors

In [16]:
# Settings passed to stressors.label.
# NOTE(review): 'embedding' is a user-specific absolute path (presumably a
# word-vector binary); the cell only runs on a machine with this exact layout.
config = dict(
    embedding='/home/weiching/Dropbox (Holmusk)/NLP - Wei Ching/GoogleNews-vectors-optimized.bin',
)

model = stressors.label(config)

In [17]:
# Pair every distinct `happening` text with a None placeholder in the first
# slot, giving the [None, text] records passed to model.generate.
data = [[None, text] for text in df_happening['happening'].tolist()]

In [18]:
# Preview the first 20 [None, text] records.
data[:20]

[[None, 'Post partum contributed to depression and increase in drinking.'],
 [None, nan],
 [None, 'NOTHING DIFFERENT'],
 [None, 'NOTHING'],
 [None,
  'NO LONGER HAD OVER THE ROAD TRUCK DRIVING JOB AND WAS HOME EVERY EVENING'],
 [None, 'CLIENT STATES IS ONLY A SOCIAL DRINKER'],
 [None, 'IN THE SERVICE AND SEEING PEOPLE KILLED, BLOWN UP'],
 [None, "it appears clt. doesn't want to give much information"],
 [None, 'Brother was murdered'],
 [None, 'PARTIES WITH FRIENDS AT 16'],
 [None, 'FAMILY PROBLEMS'],
 [None, 'NOTHING REALLY'],
 [None, 'nothing'],
 [None, 'CLT STS DID NOT KNOW'],
 [None, 'father passed away, stressed'],
 [None, 'CROWD CLT HUNG OUT WITH'],
 [None, 'father committed suicide'],
 [None, 'went to college'],
 [None, 'Loss of good jobs and  alot of time on  his hand.'],
 [None,
  "A lot of responsibility helping to raise younger brother because Mom was a single and Dad wans't rea"]]

In [19]:
# Run the labeller over every [None, text] record.
output, flags = model.generate(data)

# Only the first 20 entries are previewed below.
first_20 = slice(None, 20)

print('\nDefault output', output[first_20], sep='\n')

print('\nValidated flag', flags[first_20], sep='\n')

# The header is printed before convert() is called, so anything convert()
# itself emits still appears under this header.
print('\nOutput labels')
labels_preview = model.convert(output[first_20])
print(labels_preview)


Default output
[[], [], [], [], [], [], [], [], [13], [], [], [], [], [], [13], [], [13], [], [64, 66], []]

Validated flag
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

Output labels
[[], [], [], [], [], [], [], [], ['Bereavement'], [], [], [], [], [], ['Bereavement'], [], ['Bereavement'], [], ['Unstable job (changing jobs often)', 'Unemployment'], []]


In [28]:
# Store the labeller results next to the text they were generated from:
# raw output, validation flag, and the converted label names.
label_columns = {
    'default_output': output,
    'validated_flag': flags,
    'output_labels': model.convert(output),
}
for column_name, column_values in label_columns.items():
    df_happening[column_name] = column_values

In [33]:
# Attach the per-text labels to every row of the full table.
# The join key is named once (it was previously listed twice in `on`).
# validate='many_to_one' makes pandas raise if a `happening` text occurs more
# than once in df_happening, which would otherwise silently duplicate rows.
df_merged = pd.merge(df, df_happening, on='happening', validate='many_to_one')
df_merged.head()

Unnamed: 0,ednum,happening,ageinc,siteid,abstn6mths,agereg,abstnlongest,agefirst,backgroundid,default_output,validated_flag,output_labels
0,457814,Post partum contributed to depression and incr...,33.0,ArapahoeHouse,,33.0,,15,14264,[],0.0,[]
1,732698,,8.0,ArapahoeHouse,6 months,25.0,8 years,17,30582,[],0.0,[]
2,988020,,,ArapahoeHouse,,,,18,28172,[],0.0,[]
3,687950,,30.0,ArapahoeHouse,,22.0,,22,32263,[],0.0,[]
4,127936,,30.0,ArapahoeHouse,,22.0,,22,32263,[],0.0,[]


In [35]:
# Persist the labelled table without the index column.
# The list-valued label columns are written as text (e.g. "[13, 51]"), so they
# come back as strings when this file is read again.
df_merged.to_csv('dap_history_labels.csv',index=False)

In [88]:
# Reload the labelled table written by the to_csv cell.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk.
# NOTE(review): the saved output of this cell carries the residue of a pandas
# warning, presumably a mixed-type DtypeWarning -- confirm on the next clean run.
df_labels = pd.read_csv('dap_history_labels.csv', low_memory=False)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [91]:
# Sanity check: (rows, columns) of the reloaded labels table.
print(df_labels.shape)

(786741, 12)


In [90]:
# Number of rows per distinct (stringified) label list.
df_labels['default_output'].value_counts()

[]          769423
[11]          4316
[13]          2662
[56]          2057
[66]           967
             ...  
[45]             1
[44]             1
[13, 51]         1
[11, 66]         1
[60]             1
Name: default_output, Length: 74, dtype: int64

In [89]:
# Share of rows per distinct (stringified) label list; normalize=True returns
# proportions instead of counts.
df_labels['default_output'].value_counts(normalize=True)

[]          0.977988
[11]        0.005486
[13]        0.003384
[56]        0.002615
[66]        0.001229
              ...   
[45]        0.000001
[44]        0.000001
[13, 51]    0.000001
[11, 66]    0.000001
[60]        0.000001
Name: default_output, Length: 74, dtype: float64

In [95]:
# Keep one row per distinct (text, raw output, label names) combination.
label_view_columns = ['happening', 'default_output', 'output_labels']
df_labels_unique = df_labels.loc[:, label_view_columns].drop_duplicates()

In [122]:
def _count_listed_ids(serialised):
    """Count the non-empty comma-separated entries in text such as '[13, 51]'."""
    without_brackets = serialised.replace('[', '').replace(']', '')
    return sum(1 for piece in without_brackets.split(',') if piece)


# `default_output` holds stringified lists here, so the entries are counted
# from the text itself.
df_labels_unique['output_len'] = df_labels_unique['default_output'].map(_count_listed_ids)
df_labels_unique

Unnamed: 0,happening,default_output,output_labels,output_len
0,Post partum contributed to depression and incr...,[],[],0
1,,[],[],0
673136,NOTHING DIFFERENT,[],[],0
673137,NOTHING,[],[],0
673293,NO LONGER HAD OVER THE ROAD TRUCK DRIVING JOB ...,[],[],0
...,...,...,...,...
786437,Pt. didn't explain.,[],[],0
786559,loss of mother,[],[],0
786684,felt lonely,[],[],0
786690,school and family problems,[],[],0


In [130]:
# First 50 distinct texts that received no label at all.
df_labels_unique.query('output_len == 0').head(50)

Unnamed: 0,happening,default_output,output_labels,output_len
0,Post partum contributed to depression and incr...,[],[],0
1,,[],[],0
673136,NOTHING DIFFERENT,[],[],0
673137,NOTHING,[],[],0
673293,NO LONGER HAD OVER THE ROAD TRUCK DRIVING JOB ...,[],[],0
673294,CLIENT STATES IS ONLY A SOCIAL DRINKER,[],[],0
673295,"IN THE SERVICE AND SEEING PEOPLE KILLED, BLOWN UP",[],[],0
673299,it appears clt. doesn't want to give much info...,[],[],0
673302,PARTIES WITH FRIENDS AT 16,[],[],0
673303,FAMILY PROBLEMS,[],[],0


In [121]:
# Spot-check the counting logic on the last row: strip the brackets from the
# stringified list and count the non-empty comma-separated entries.
last_serialised = df_labels_unique['default_output'].iloc[-1]
last_pieces = last_serialised.replace('[','').replace(']','').split(',')
sum(1 for piece in last_pieces if piece)

1