In [1]:
import pandas as pd
import numpy as np
import os

# Load and Merge Data
- want to see cycle data

In [2]:
fp = os.path.join('..', 'data_out', 'Symptoms_clean.csv')
symptoms =pd. read_csv(fp)
#set zeroes as nans
symptoms.replace(0, np.NaN, inplace=True)
symptoms.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13512 entries, 0 to 13511
Data columns (total 14 columns):
symptom_id    13512 non-null int64
user_id       13512 non-null int64
acne          4230 non-null float64
backache      5027 non-null float64
bloating      5278 non-null float64
cramp         6943 non-null float64
diarrhea      2233 non-null float64
dizzy         2170 non-null float64
headache      4402 non-null float64
mood          5177 non-null float64
nausea        2194 non-null float64
sore          4090 non-null float64
date          13512 non-null object
date_fix      13511 non-null object
dtypes: float64(10), int64(2), object(2)
memory usage: 1.4+ MB


#### Agg Symptoms

In [3]:
categories = ['acne', 'backache', 'bloating', 'cramp',
       'diarrhea', 'dizzy', 'headache', 'mood', 'nausea', 'sore']
agg_symptoms = symptoms[categories].agg(['mean', 'count']).T
agg_symptoms.sort_values('count', ascending=False)
agg_symptoms.reset_index().plot.bar(x='index', y='count', title='Top Reported Symptoms')

<matplotlib.axes._subplots.AxesSubplot at 0xeee64e0>

#### Agg to the User Level

In [4]:
users_agg_symptoms = symptoms.groupby(['user_id'], as_index=False)[categories].mean()
users_agg_symptoms.sample(5)

Unnamed: 0,user_id,acne,backache,bloating,cramp,diarrhea,dizzy,headache,mood,nausea,sore
1906,3608,89.0,91.0,90.0,93.0,100.0,98.0,100.0,100.0,100.0,100.0
2766,5025,55.0,,100.0,100.0,100.0,,,52.0,,
3376,6066,,52.0,,,53.0,,19.0,84.0,,22.0
3550,6356,92.0,87.0,88.0,100.0,,75.0,79.0,99.0,,91.0
1327,2648,,,,,,,,,,


#### Cycle Data

In [5]:
fp = os.path.join('..', 'data_out', 'Clean_periods_clean.csv')
periods = pd.read_csv(fp, parse_dates=['start_date_fix', 'end_date_fix'])
periods.length = pd.to_timedelta(periods.length, unit='days')
periods.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31964 entries, 0 to 31963
Data columns (total 7 columns):
period_id         31964 non-null int64
start_date        31964 non-null object
end_date          31964 non-null object
user_id           31964 non-null int64
start_date_fix    31964 non-null datetime64[ns]
end_date_fix      31964 non-null datetime64[ns]
length            31964 non-null timedelta64[ns]
dtypes: datetime64[ns](2), int64(2), object(2), timedelta64[ns](1)
memory usage: 1.7+ MB


#### create cycle lengths
- drop any cycle with length over 45 days

In [6]:
print('predrop:', periods.shape[0])
periods_multi_respos = periods.groupby('user_id').filter(lambda x: x.shape[0] > 1).copy()
print('postdrop:', periods_multi_respos.shape[0])
#sort
periods_multi_respos.sort_values(['user_id', 'start_date_fix'], inplace=True)
#roll column
periods_multi_respos['previous_start_date'] = periods_multi_respos.start_date_fix.shift(1)
#calc difference
periods_multi_respos['cycle_length'] = periods_multi_respos['start_date_fix'] - periods_multi_respos['previous_start_date']
#set non repeated user it 
cumcount = periods_multi_respos.groupby('user_id').cumcount()
periods_multi_respos.loc[cumcount==0, 'cycle_length'] = np.NaN
#drop missing data
periods_multi_respos.dropna(subset=['cycle_length'], inplace=True)
#check cycle_length stats
print('stats pre drop:', periods_multi_respos.cycle_length.describe(), sep='\n')
#drop any cycle ove 45 days
periods_multi_respos = periods_multi_respos[periods_multi_respos.cycle_length < pd.to_timedelta(45, unit='days')]
print('stats post drop:', periods_multi_respos.cycle_length.describe(), sep='\n')
#create cycle df
select = ['user_id', 'previous_start_date', 'cycle_length']
cycles = periods_multi_respos[select].copy()
cycles.rename(columns={'previous_start_date': 'cycle_start_date'}, inplace=True)
fp = os.path.join('..', 'data_out', 'cycles.csv')
cycles.to_csv(fp, index=False)
cycles.sample(5)

predrop: 31964
postdrop: 28549
stats pre drop:
count                      25377
mean     36 days 18:27:59.347440
std      34 days 18:54:53.149531
min              0 days 00:00:00
25%             27 days 00:00:00
50%             29 days 00:00:00
75%             34 days 00:00:00
max            947 days 00:00:00
Name: cycle_length, dtype: object
stats post drop:
count                      21962
mean     28 days 18:00:25.571441
std       5 days 07:30:12.911247
min              0 days 00:00:00
25%             26 days 00:00:00
50%             28 days 00:00:00
75%             31 days 00:00:00
max             44 days 00:00:00
Name: cycle_length, dtype: object


Unnamed: 0,user_id,cycle_start_date,cycle_length
4279,818,2016-01-19,26 days
12830,4539,2016-08-07,26 days
10947,3315,2016-08-25,24 days
16169,2754,2017-03-09,28 days
7632,3158,2016-05-05,27 days


# Get data to add to data set

In [7]:
fp = os.path.join('..', 'data_out', 'User_clean.csv')
users = pd.read_csv(fp)
users = users[['user_id', 'cycle_length_initial', 'period_length_initial']]
users.columns = ['user_id', 'expected_cycle_length', 'expected_period_length']
#expected dates as timedeltas
users.expected_cycle_length = pd.to_timedelta(users.expected_cycle_length, unit='days')
users.expected_period_length = pd.to_timedelta(users.expected_period_length, unit='days')
users.sample(5)

Unnamed: 0,user_id,expected_cycle_length,expected_period_length
1976,2303,28 days,2 days
1099,1426,28 days,6 days
2804,3131,30 days,3 days
3660,3987,28 days,3 days
2928,3255,29 days,5 days


#### get average period length

In [None]:
mean_period_days = pd.to_timedelta(periods.groupby('user_id').apply(lambda x: x.length.dt.days.mean()), unit='days').reset_index()
mean_period_days.columns = ['user_id', 'mean_period_length']
mean_period_days.sample(5)

#### get average cycle

In [None]:
mean_cycle_days = pd.to_timedelta(cycles.groupby('user_id')['cycle_length'].apply(lambda x: x.dt.days.mean()), unit='days').reset_index()
mean_cycle_days.columns = ['user_id', 'mean_cycle_length']
mean_cycle_days.sample(5)

#### format data for modeling
- data adds, symptoms, expected values

In [None]:
print('premerge:', cycles.shape)
model_sample = cycles.merge(users, on='user_id', how='left')
model_sample = model_sample.merge(mean_period_days, on='user_id', how='left')
model_sample = model_sample.merge(mean_cycle_days, on='user_id', how='left')
model_sample = model_sample.merge(users_agg_symptoms, on='user_id', how='left')
print('postmerge:', model_sample.shape)
fp = os.path.join('..', 'data_out', 'model_sampl.csv')
model_sample.to_csv(fp, index=False)
model_sample.sample(5)

#### Error

In [None]:
np.abs(model_sample.cycle_length - model_sample.expected_cycle_length).describe()

#### Look at good gap in data

In [None]:
days = cycles.cycle_length.dt.days
days = days[(days > 0 ) & (days < 100)]
days.plot.hist()