In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("../data/rmh_raw.csv")
df.shape

(466605, 5)

In [None]:
# # The dataset without spelling correction has records until mid 2019
# df_ = pd.read_csv("../data/spelling_correction/rmh_nospellcorr.csv")
# df_.shape

# (df_[:466605].text == df.text).all()

# (df_[:466605].text_clean == df.text_clean).all()

# df.text_clean = df_[:466605].text_clean

In [3]:
df.SH.value_counts(dropna=False)

0    460009
1      6596
Name: SH, dtype: int64

In [4]:
df.SI.value_counts(dropna=False)

NaN    372712
0.0     87634
1.0      6259
Name: SI, dtype: int64

In [5]:
df.SH.sum() / df.shape[0] * 100

1.413615370602544

**Create a holdout set**

In [6]:
df_ho = df[df.year==2018].copy()

In [7]:
df_ho.SH.value_counts(dropna=False)

0    75165
1     1218
Name: SH, dtype: int64

In [8]:
df_ho.SI.value_counts(dropna=False)

0.0    74871
1.0     1512
Name: SI, dtype: int64

In [9]:
df_ho[(df_ho.SH == 1) & (df_ho.SI == 1)].shape

(0, 5)

In [10]:
df_ho.SH.sum() / df_ho.shape[0] * 100

1.5945956561014885

In [11]:
df_ho.SI.sum() / df_ho.shape[0] * 100

1.9794980558501238

In [13]:
df_ho.to_csv("../data/rmh_raw_holdout.csv", index=False)

In [14]:
df.drop(df[df.year==2018].index, inplace=True)
df.reset_index(drop=True, inplace=True)

**Number and percentage of SH cases**

In [15]:
df.SH.value_counts(dropna=False)

0    384844
1      5378
Name: SH, dtype: int64

In [16]:
df.SH.sum() / df.shape[0] * 100

1.378189850905382

**Create a train/test split**

In [17]:
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42, stratify=df.SH)

def size_mb(docs):
    return sum(len(s.encode('utf-8')) for s in docs) / 1e6

print("Data loaded")

print("The training set contains {} records ({:.3f}MB):".format(df_train.shape[0], size_mb(df_train)))
print(df_train.SH.value_counts())
print("\nThe test set contains {} records ({:.3f}MB):".format(df_test.shape[0], size_mb(df_test)))
print(df_test.SH.value_counts())

Data loaded
The training set contains 312177 records (0.000MB):
0    307875
1      4302
Name: SH, dtype: int64

The test set contains 78045 records (0.000MB):
0    76969
1     1076
Name: SH, dtype: int64


In [18]:
df_train.to_csv("../data/rmh_raw_train.csv", index=False)
df_test.to_csv("../data/rmh_raw_test.csv", index=False)