# Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

### Load the Training CSV

In [2]:
# Read the merged training table and report its columns and size before filtering.
data = pd.read_csv('datasets/train_merge_total.csv', sep=',')
print("Columns:", data.columns)
print("Initially", len(data), "rows")

# Keep only rows where both rhoticity ratings are finite (drops NaN and +/-inf).
data = data[np.isfinite(data['rhoticity1']) & np.isfinite(data['rhoticity2'])]

print("Filtered NaN labels to", len(data), "rows")

# With non-finite ratings removed, both rating columns can be cast to int.
data = data.astype({'rhoticity1': int, 'rhoticity2': int})
# Missing vowel types are replaced by the literal string 'none'.
data['vowel_type'] = data['vowel_type'].fillna('none')
data.head()



  interactivity=interactivity, compiler=compiler, result=result)


Columns: Index(['speaker', 'token', 'word', 'norm_time', 'time', 'c1', 'c2', 'c3', 'c4',
       'c5', 'c6', 'c7', 'c8', 'c9', 'c10', 'c11', 'c12', 'rhoticity1',
       'rhoticity2', 'dep_variable', 'sex', 'birth_year', 'pre_segment',
       'fol_segment', 'vowelR', 'vowel_type', 'time_y', 'F1', 'F2', 'F3', 'Z2',
       'Z3', 'Z2-Z3', 'Z*'],
      dtype='object')
Initially 494833 rows
Filtered NaN labels to 494833 rows


Unnamed: 0,speaker,token,word,norm_time,time,c1,c2,c3,c4,c5,...,vowelR,vowel_type,time_y,F1,F2,F3,Z2,Z3,Z2-Z3,Z*
0,Boston2015-16_118,Boston2015-16_118_short1,short,0.01,13.1651,-1267.561536,-1209.016526,-829.048466,-508.252388,-305.130238,...,AO1R,back,13.1651,377.5567,1377.9991,2261.2983,10.4723,13.7191,-3.2468,12.0957
1,Boston2015-16_118,Boston2015-16_118_short1,short,0.02,13.1663,-1196.413804,-1264.286082,-868.443646,-557.983277,-281.922436,...,AO1R,back,13.1662,379.5048,1375.4122,2253.6803,10.4604,13.6967,-3.2363,12.0785
2,Boston2015-16_118,Boston2015-16_118_short1,short,0.03,13.1674,-1187.723367,-1449.34661,-872.610353,-503.566163,-289.390192,...,AO1R,back,13.1673,381.7783,1370.6928,2255.9149,10.4385,13.7033,-3.2648,12.0709
3,Boston2015-16_118,Boston2015-16_118_short1,short,0.04,13.1685,-1186.670853,-1331.14999,-884.239085,-524.813311,-311.409256,...,AO1R,back,13.1684,384.0517,1365.9734,2258.1496,10.4166,13.71,-3.2933,12.0633
4,Boston2015-16_118,Boston2015-16_118_short1,short,0.05,13.1696,-1187.932907,-1278.971283,-861.975815,-561.425275,-310.147997,...,AO1R,back,13.1695,386.3252,1361.254,2260.3842,10.3948,13.7166,-3.3219,12.0557


### Filter Border Cases

In [3]:
# Rows whose dep_variable is exactly 0.5 are treated as border cases and dropped.
print("Filtering out", int((data['dep_variable'] == 0.5).sum()), "border cases")
data = data[data['dep_variable'] != 0.5]

# Cast the remaining dep_variable values to int.
data['dep_variable'] = data['dep_variable'].astype(int)

# Report how many rows remain in each class (1 is printed as rhotic, 0 as non-rhotic).
print("Rhotic Token Count: " + str(int((data['dep_variable'] == 1).sum())))
print("Non-Rhotic Token Count: " + str(int((data['dep_variable'] == 0).sum())))

Filtering out 72422 border cases
Rhotic Token Count: 269685
Non-Rhotic Token Count: 152726


### Encode Categorical Variables

In [4]:
# Turn the categorical columns into integer codes. The encoder is not fitted
# here: its classes_ are loaded from the .npy files under arrays/ and then
# used to transform the string form of each column.
encoder = LabelEncoder()
# Speech-style encoding is currently disabled:
# encoder.classes_ = numpy.load('arrays/style_classes.npy')
# data['speech_style'] = encoder.fit_transform(data['style'].astype(str))

# pre_segment is handled first so its class list can be printed for inspection.
encoder.classes_ = np.load('arrays/preseg_classes.npy')
print(encoder.classes_)
data['pre_segment'] = encoder.transform(data['pre_segment'].astype(str))

# The remaining columns follow the same load-classes-then-transform recipe.
for column, classes_path in (
    ('fol_segment', 'arrays/folseg_classes.npy'),
    ('vowelR', 'arrays/vowelR_classes.npy'),
    ('vowel_type', 'arrays/vowel_type_classes.npy'),
):
    encoder.classes_ = np.load(classes_path)
    data[column] = encoder.transform(data[column].astype(str))
data.head(21)

['AW1' 'AY1' 'B' 'CH' 'D' 'DH' 'ER0' 'F' 'G' 'H' 'HH' 'IY0' 'IY1' 'K' 'L'
 'M' 'N' 'NG' 'OW1' 'P' 'R' 'S' 'SH' 'T' 'UW1' 'V' 'W' 'Y' 'Z' 'nan' 'sp']


Unnamed: 0,speaker,token,word,norm_time,time,c1,c2,c3,c4,c5,...,vowelR,vowel_type,time_y,F1,F2,F3,Z2,Z3,Z2-Z3,Z*
0,Boston2015-16_118,Boston2015-16_118_short1,short,0.01,13.1651,-1267.561536,-1209.016526,-829.048466,-508.252388,-305.130238,...,3,0,13.1651,377.5567,1377.9991,2261.2983,10.4723,13.7191,-3.2468,12.0957
1,Boston2015-16_118,Boston2015-16_118_short1,short,0.02,13.1663,-1196.413804,-1264.286082,-868.443646,-557.983277,-281.922436,...,3,0,13.1662,379.5048,1375.4122,2253.6803,10.4604,13.6967,-3.2363,12.0785
2,Boston2015-16_118,Boston2015-16_118_short1,short,0.03,13.1674,-1187.723367,-1449.34661,-872.610353,-503.566163,-289.390192,...,3,0,13.1673,381.7783,1370.6928,2255.9149,10.4385,13.7033,-3.2648,12.0709
3,Boston2015-16_118,Boston2015-16_118_short1,short,0.04,13.1685,-1186.670853,-1331.14999,-884.239085,-524.813311,-311.409256,...,3,0,13.1684,384.0517,1365.9734,2258.1496,10.4166,13.71,-3.2933,12.0633
4,Boston2015-16_118,Boston2015-16_118_short1,short,0.05,13.1696,-1187.932907,-1278.971283,-861.975815,-561.425275,-310.147997,...,3,0,13.1695,386.3252,1361.254,2260.3842,10.3948,13.7166,-3.3219,12.0557
5,Boston2015-16_118,Boston2015-16_118_short1,short,0.06,13.1707,-1139.979275,-1338.346039,-860.908067,-549.633166,-295.748252,...,3,0,13.1706,388.5986,1356.5346,2262.6189,10.3729,13.7233,-3.3504,12.0481
6,Boston2015-16_118,Boston2015-16_118_short1,short,0.07,13.1718,-1227.769009,-1315.73834,-876.046671,-508.831386,-297.988347,...,3,0,13.1717,390.872,1351.8152,2264.8535,10.351,13.73,-3.3789,12.0405
7,Boston2015-16_118,Boston2015-16_118_short1,short,0.08,13.1729,-1238.242613,-1296.144049,-867.605982,-463.705142,-276.136601,...,3,0,13.1728,392.5502,1342.0305,2261.8786,10.3045,13.721,-3.4165,12.0128
8,Boston2015-16_118,Boston2015-16_118_short1,short,0.09,13.174,-1149.09948,-1261.944199,-933.018145,-523.16595,-262.013038,...,3,0,13.1739,393.8535,1329.0564,2255.6236,10.2425,13.7023,-3.4598,11.9724
9,Boston2015-16_118,Boston2015-16_118_short1,short,0.11,13.1763,-1132.916286,-1412.198436,-878.328958,-543.164332,-304.298815,...,3,0,13.1761,396.4602,1303.1082,2243.1135,10.1186,13.6649,-3.5464,11.8917


### Process Time Series

In [5]:
# Number of frames per token; passed to get_speech_tokens as window_size.
token_length = 100
# Number of per-frame features; passed to get_speech_tokens as num_features and
# must equal the number of arrays that function stacks for each token.
num_features = 8

def windows(data):
    """Yield positional ``(start, end)`` bounds for each run of equal tokens.

    Rows are scanned in order; every maximal run of consecutive rows whose
    ``'token'`` value compares equal becomes one half-open window, suitable
    for ``data.iloc[start:end]``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``'token'`` column. Rows of the same token are expected
        to be contiguous; a token that reappears later starts a new window.

    Yields
    ------
    (int, int)
        Start (inclusive) and end (exclusive) row positions of one window.

    Notes
    -----
    An empty frame yields nothing. A missing (NaN) token never compares equal
    to anything, so each such row forms its own one-row window.
    """
    # Read the column once: per-row ``data.iloc[i]['token']`` lookups build a
    # whole row Series each time and dominate the runtime on large frames.
    tokens = data['token'].values
    total = len(tokens)

    start = 0
    while start < total:
        # Always consume at least one row so the scan is guaranteed to advance.
        end = start + 1
        while end < total and tokens[end] == tokens[start]:
            end += 1
        yield start, end
        start = end
        
def get_speech_tokens(data, window_size=21, num_features=3):
    """Build fixed-length per-token feature arrays and one label per token.

    Rows are grouped into tokens by ``windows(data)``. For each token, eight
    per-frame features are stacked in this order: ``Z2``, ``Z3``, ``Z2-Z3``,
    ``Z*`` (cast to float), then ``pre_segment``, ``fol_segment``, ``vowelR``,
    ``vowel_type`` (cast to int).

    Tokens with fewer than ``window_size`` rows are left-padded with zeros in
    every feature. Tokens with more than ``window_size`` rows are skipped.
    NOTE(review): zero padding is also applied to the integer-coded
    categorical features, where 0 may be a real class code - confirm this is
    intended.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the eight feature columns above plus ``'token'``
        and ``'dep_variable'``.
    window_size : int
        Number of frames per output token.
    num_features : int
        Expected size of the feature axis. It must equal the number of
        stacked features (8) whenever at least one token is produced.

    Returns
    -------
    tokens : numpy.ndarray
        Float array of shape ``(n_tokens, window_size, num_features)``.
    labels : numpy.ndarray
        Float array of shape ``(n_tokens,)`` holding, per token, the most
        frequent ``dep_variable`` value among its rows.

    Raises
    ------
    ValueError
        If a token is produced and ``num_features`` does not match the number
        of stacked features.
    """
    continuous_cols = ['Z2', 'Z3', 'Z2-Z3', 'Z*']
    categorical_cols = ['pre_segment', 'fol_segment', 'vowelR', 'vowel_type']  # , 'speech_style']

    # Collect per-token results in lists and combine once at the end; growing
    # arrays with vstack/append inside the loop copies everything each time.
    token_arrays = []
    token_labels = []

    for start, end in windows(data):
        segment = data.iloc[start:end]
        length = len(segment)
        # Nothing to stack for an empty window; over-long tokens are dropped.
        if length == 0 or length > window_size:
            continue

        columns = [segment[col].astype(float).values for col in continuous_cols]
        columns += [np.array(segment[col]).astype(int) for col in categorical_cols]

        if length < window_size:
            # Left-pad so the token's real frames sit at the end of the window.
            padding = np.zeros(window_size - length, dtype=int)
            columns = [np.hstack([padding, col]) for col in columns]

        # dstack turns the 1-D columns into one (1, window_size, n_columns) block.
        feature_stack = np.dstack(columns)
        if feature_stack.shape[2] != num_features:
            raise ValueError(
                "num_features=%d does not match the %d features stacked per token"
                % (num_features, feature_stack.shape[2])
            )
        token_arrays.append(feature_stack)

        # stats.mode returns a length-1 array on older SciPy and a scalar on
        # SciPy >= 1.11 for 1-D input; atleast_1d handles both.
        mode_value = stats.mode(np.asarray(segment['dep_variable']))[0]
        token_labels.append(np.atleast_1d(mode_value)[0])

    if token_arrays:
        tokens = np.vstack(token_arrays).astype(float)
    else:
        tokens = np.empty((0, window_size, num_features))
    labels = np.asarray(token_labels, dtype=float)

    return tokens, labels

### Perform Preprocessing and Reshape the Data

In [6]:
# Convert the frame-level table into per-token feature arrays and labels.
segments, y = get_speech_tokens(data, window_size=token_length, num_features=num_features)
print("Segment shape:", segments.shape)
# Make the (tokens, frames, features) layout explicit for the saved array.
X = segments.reshape(segments.shape[0], token_length, num_features)
print("X shape:", X.shape)
print("y shape:", y.shape)

Segment shape: (3000, 100, 8)
X shape: (3000, 100, 8)
y shape: (3000,)


### Save Arrays to File

In [7]:
# Write the feature and label arrays as plain .npy files (pickling disabled).
for out_path, array in (
    ("arrays/X_train_bark.npy", X),
    ("arrays/y_train_bark.npy", y),
):
    np.save(out_path, array, allow_pickle=False)