# Overview
This notebook reads in the Spanish, English, & French data, processes it (feature engineering), and pickles the results for use by other (modeling) notebooks. 

In [1]:
## 
# Controls: use this cell to control global variables

# Which languages' data to load. The codes are lower-cased here, so any capitalisation is accepted.
langs = list(map(str.lower, ['En', 'Sp', 'Fr']))

# How many rows to keep from each data set: 'all' keeps everything, an integer keeps only the
# first N rows (useful for debugging, e.g. 25000).
subset_size = 'all'

# Pickling controls -- not currently implemented: no other visible cell reads these two values.
write_out = True  # do we want to pickle the data for later use?
write_fn = 'EnEsFr_Data.pickle'

In [2]:
# Original libraries
## with JL annotations
import argparse ## for dealing with command line args (it seems)
from collections import defaultdict, namedtuple ## useful data structs (defaultdict provides result if !exists key)
from io import open ## input/output 
import math ## duh
import os ## navigating the os
from random import shuffle, uniform ## duh

from future.builtins import range 
from future.utils import iteritems

# Sigma is the L2 prior variance, regularizing the baseline model. Smaller sigma means more regularization.
# NOTE(review): not referenced by any cell visible in this notebook -- presumably carried over from
# the baseline script; confirm before removing.
_DEFAULT_SIGMA = 20.0

# Eta is the learning rate/step size for SGD. Larger means larger step size.
# NOTE(review): likewise not referenced by any visible cell -- confirm before removing.
_DEFAULT_ETA = 0.1


#Brendan and Jarrett libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#from googletrans import Translator
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pickle # save data to file (to avoid having to load 2mil lines every time)
import numpy as np
from nltk.metrics import edit_distance
from nltk.stem import SnowballStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import words
from nltk.metrics import edit_distance
from scipy.stats import zscore
from tqdm import tqdm # progress bar
#import warnings
#warnings.simplefilter("ignore")

# Let wide / long frames print in full when inspected in the notebook.
pd.set_option('display.max_columns', 999)
pd.set_option('display.max_rows', 1000)



In [3]:
def load_data(filename):
    """
    Read a data file and turn every token line into an InstanceData object.

    The file is treated as labelled training data when the substring 'train' occurs anywhere in
    `filename`; in that case each token line must have 7 whitespace-separated fields (the last one
    being the label), otherwise 6.

    File layout, as parsed below:
        - a line starting with '#' opens a new exercise and carries space-separated key:value properties
        - a blank line closes the current exercise
        - every other line describes one token (instance) of the current exercise

    Parameters:
        filename: the location of the training or test data you want to load.

    Returns:
        data: a list of InstanceData objects, in file order.
        labels (training files only): a dict of instance_id:label pairs; when present the function
            returns the tuple (data, labels) instead of just data.
    """
    is_training = 'train' in filename
    if is_training:
        labels = dict()

    data = []
    exercise_count = 0
    print('Loading instances...')

    with open(filename, 'rt') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()

            # Blank line: the current exercise is finished; report progress every 100000 exercises.
            if not stripped:
                exercise_count += 1
                if exercise_count % 100000 == 0:
                    print('Loaded ' + str(len(data)) + ' instances across ' + str(exercise_count) + ' exercises...')
                continue

            # Header line: collect the properties shared by all instances of the new exercise.
            if stripped[0] == '#':
                instance_properties = dict()
                for pair in stripped[2:].split():
                    key, value = pair.split(':')
                    if key == 'countries':
                        value = value.split('|')
                    elif key == 'days':
                        value = float(value)
                    elif key == 'time':
                        if value == 'null':
                            value = None
                        else:
                            assert '.' not in value
                            value = int(value)
                    instance_properties[key] = value
                continue

            # Token line: one instance of the current exercise.
            fields = stripped.split()
            assert len(fields) == (7 if is_training else 6)
            assert len(fields[0]) == 12

            # The same dict is reused for every instance of an exercise; the per-instance keys
            # are simply overwritten each time (InstanceData copies the values it needs).
            instance_properties['instance_id'] = fields[0]
            instance_properties['token'] = fields[1]
            instance_properties['part_of_speech'] = fields[2]

            morph = dict()
            instance_properties['morphological_features'] = morph
            for feature in fields[3].split('|'):
                name, feature_value = feature.split('=')
                # 'Person' is the only morphological feature stored as an int.
                morph[name] = int(feature_value) if name == 'Person' else feature_value

            instance_properties['dependency_label'] = fields[4]
            instance_properties['dependency_edge_head'] = int(fields[5])
            if is_training:
                labels[fields[0]] = float(fields[6])
            data.append(InstanceData(instance_properties=instance_properties))

        print('Done loading ' + str(len(data)) + ' instances across ' + str(exercise_count) +
              ' exercises.\n')

    if is_training:
        return data, labels
    return data

class InstanceData(object):
    """
    Minimal record type holding the properties of a single instance (token), plus a few values
    derived from its 12-character instance id. Intended as a convenient starting point for
    feature engineering.
    """
    def __init__(self, instance_properties):
        # Parameters specific to this instance (copied as-is from the input dict)
        for key in ('instance_id', 'token', 'part_of_speech', 'morphological_features',
                    'dependency_label', 'dependency_edge_head'):
            setattr(self, key, instance_properties[key])

        # Derived from the instance id: characters 8-9 are the exercise index, 10-11 the token index
        iid = self.instance_id
        self.exercise_index = int(iid[8:10])
        self.token_index = int(iid[10:12])

        # The first 10 characters identify the exercise
        self.exercise_id = iid[:10]

        # Parameters shared across the whole session (copied as-is from the input dict)
        for key in ('user', 'countries', 'days', 'client', 'session', 'format', 'time'):
            setattr(self, key, instance_properties[key])

        # The first 8 characters identify the session
        self.session_id = iid[:8]

    def to_features(self):
        """
        Build the sparse one-hot feature dict used by the baseline logistic regression.

        Returns:
            a dict whose keys are '<feature>:<value>' strings (plus a 'bias' key) and whose values
            are all 1.0. Morphological features contribute their names only, not their values.
        """
        features = {
            'bias': 1.0,
            'user:' + self.user: 1.0,
            'format:' + self.format: 1.0,
            'token:' + self.token.lower(): 1.0,
            'part_of_speech:' + self.part_of_speech: 1.0,
        }
        features.update(
            ('morphological_feature:' + name, 1.0) for name in self.morphological_features
        )
        features['dependency_label:' + self.dependency_label] = 1.0
        return features

In [4]:
# Given a dataset in duolingo format and a set of labels for each, produce a pandas df
def makeDF(instances, labels = 0, train = True):
    """
    Flatten a sequence of InstanceData-like objects into a pandas DataFrame, one row per instance.

    Parameters:
        instances: sequence of objects exposing the InstanceData attributes read below.
        labels: mapping of instance_id -> label; only consulted when `train` is True.
        train: when True an 'error' column is added from `labels`.

    Returns:
        a DataFrame with one column per copied attribute ('countries' is joined into a single
        space-separated string) and, for training data, a trailing 'error' column.
    """
    rows = []
    for inst in instances:
        row = {
            'user': inst.user,
            'countries': ' '.join(inst.countries),
            'days': inst.days,
            'client': inst.client,
            'session': inst.session,
            'sessionID': inst.session_id,
            'format': inst.format,
            'time': inst.time,
            'exerciseID': inst.exercise_id,
            'instanceID': inst.instance_id,
            'tokenIndex': inst.token_index,
            'token': inst.token,
            'dependencyLbl': inst.dependency_label,
            'dependencyEdgeHead': inst.dependency_edge_head,
            'pos': inst.part_of_speech,
            'morpho': inst.morphological_features,
        }
        rows.append(row)
        # The label is looked up by instance id and added last, so 'error' ends up as the final column.
        if train == True:
            row['error'] = labels[row['instanceID']]
    return pd.DataFrame(rows)

# Load Data 

In [5]:
## Load in the data using their pre-existing function & take subset if desired & pandafy

# Functions mapping an L2 abbreviation to the name of that L2 and where to find its data files.
# All four lookups share one table so that a language only has to be declared once.
_L2_TRACKS = {
    # code: (display name, track code used in the data directory and file names)
    'sp': ('Spanish', 'es_en'),  # L1 English L2 Spanish
    'en': ('English', 'en_es'),  # L1 Spanish L2 English
    'fr': ('French', 'fr_en'),   # L1 English L2 French
}

def _l2_file_info(l2, filename_template):
    """
    Build the {'name': ..., 'path': ...} dict for one language and one kind of data file.

    Parameters:
        l2: language name or abbreviation; only its lower-cased first two letters are used.
        filename_template: file name with a '{track}' placeholder for the track code.

    Returns:
        a new dict with keys 'name' (display name of the L2) and 'path' (relative file path).

    Raises:
        KeyError: if the first two letters of `l2` are not a known language code.
    """
    code = l2.lower()[0:2] # make sure we have the lowercase first 2 letters of the language name
    if code not in _L2_TRACKS:
        raise KeyError('unknown L2 %r; expected a name starting with one of %s'
                       % (l2, sorted(_L2_TRACKS)))
    name, track = _L2_TRACKS[code]
    return {'name': name,
            'path': '../Data/data_' + track + '/' + filename_template.format(track=track)}

def get_filepath(l2):
    """Return the name and path of the training file for language `l2`."""
    return _l2_file_info(l2, '{track}.slam.20171218.train')

def get_dev_filepath(l2):
    """Return the name and path of the dev file for language `l2`."""
    return _l2_file_info(l2, '{track}.slam.20171218.dev')

def get_dev_key_filepath(l2):
    """Return the name and path of the dev key (labels) file for language `l2`."""
    return _l2_file_info(l2, '{track}.slam.20171218.dev.key')

def get_test_filepath(l2):
    """Return the name and path of the test file for language `l2`."""
    return _l2_file_info(l2, 'test.{track}')
# Read in the data for the desired language(s) and save it in dictionaries keyed by language code.
# After the loop, data[l2] holds the train, dev and test rows stacked together (see 'source' column).
data = {}
devData = {}
testData = {}
for l2 in langs:
    # -- training split: instances plus their labels --
    l2_info = get_filepath(l2)
    l2_name, l2_path = l2_info['name'], l2_info['path']
    print('    Loading ' + l2_name + ' data...')
    Dat, Labs = load_data(l2_path)

    # -- dev split: instances only (the labels live in the separate key file) --
    l2_info = get_dev_filepath(l2)
    l2_name, l2_path = l2_info['name'], l2_info['path']
    print('    Loading ' + l2_name + ' dev data...')
    devDat = load_data(l2_path)

    # -- dev key: space-separated "instanceID label" pairs without a header row --
    l2_info = get_dev_key_filepath(l2)
    l2_name, l2_path = l2_info['name'], l2_info['path']
    print('    Loading ' + l2_name + ' dev key data...')
    devKey = pd.read_csv(l2_path, sep = " ", header = None)
    devKey.columns = ['instanceID', 'error']

    # -- test split: instances only --
    l2_info = get_test_filepath(l2)
    l2_name, l2_path = l2_info['name'], l2_info['path']
    print('    Loading ' + l2_name + ' test data...')
    testDat = load_data(l2_path)

    print('    Taking subset of size = ' + str(subset_size) + ' rows...', end = '')
    # keep only the first subset_size instances of every split, unless we want everything
    if subset_size != 'all':
        Dat, devDat, testDat = Dat[0:subset_size], devDat[0:subset_size], testDat[0:subset_size]
    # restrict the labels to the training instances that were kept
    ids = [inst.instance_id for inst in Dat]
    Labs = {instance_id: Labs[instance_id] for instance_id in ids}
    print(' Complete.')

    print('    Converting to a pandas dataframe ...', end ='')
    data[l2] = makeDF(Dat, Labs)
    data[l2]['source'] = 'train'
    # dev rows get their labels by joining the key file on instanceID
    devData[l2] = makeDF(devDat, train = False).merge(devKey, on = "instanceID", how = 'left')
    devData[l2]['source'] = 'dev'
    testData[l2] = makeDF(testDat, train = False)
    testData[l2]['source'] = 'test'
    data[l2] = pd.concat([data[l2], devData[l2], testData[l2]], ignore_index = True)
    # drop the references to the raw instance lists to lighten the huge memory load
    Dat = Labs = devDat = testDat = None
    print( ' Complete.')
    print('Done loading ' + l2_name + ' data!\n')

print('Done Loading Data!')

    Loading English data...
Loading instances...
Loaded 317049 instances across 100000 exercises...
Loaded 635368 instances across 200000 exercises...
Loaded 951536 instances across 300000 exercises...
Loaded 1271940 instances across 400000 exercises...
Loaded 1591345 instances across 500000 exercises...
Loaded 1911213 instances across 600000 exercises...
Loaded 2227445 instances across 700000 exercises...
Loaded 2546705 instances across 800000 exercises...
Done loading 2622958 instances across 824012 exercises.

    Loading English dev data...
Loading instances...
Loaded 334439 instances across 100000 exercises...
Done loading 387374 instances across 115770 exercises.

    Loading English dev key data...
    Loading English test data...
Loading instances...
Loaded 337728 instances across 100000 exercises...
Done loading 386604 instances across 114586 exercises.

    Taking subset of size = 25000 rows... Complete.
    Converting to a pandas dataframe ... Complete.
Done loading English 

# Process Data

This is where we engineer our own features. These apply at different levels:

(fill this out)
- Dataset:
- User:
- Exercise:
    + sentence length
- Instance:

In [86]:
## Function defs ### 
# A function that takes a df representing a single exercise (sentence) and processes it
def prepExerDat(ed):
    """
    Add exercise-level features to the rows of one exercise.

    Parameters:
        ed: DataFrame holding every token row of a single exercise; must have a 'time' column.
            It is modified in place.

    Returns:
        the same DataFrame object with two extra columns:
            sentLength   - number of rows (tokens) in the exercise
            timePerToken - 'time' divided by sentLength
    """
    # `ed` is usually a slice of a bigger frame, so adding columns would trigger pandas'
    # chained-assignment warning. option_context silences it only for this block and then puts
    # back whatever setting was active before (also when an exception is raised), instead of
    # unconditionally forcing the global option to 'warn'.
    with pd.option_context('mode.chained_assignment', None):
        ed['sentLength'] = ed.shape[0] # count the number of rows
        ed['timePerToken'] = ed['time']/ed['sentLength']
    return(ed)

# A function that takes a df representing (all of) a single user's data and processes it
def prepUserDat(ud, morphos):
    """
    Add user-level features to one user's rows, then exercise-level features via prepExerDat.

    Parameters:
        ud: DataFrame with all rows of a single user. The code below reads the columns 'source',
            'format', 'session', 'client', 'error', 'token', 'stem', 'days', 'exerciseID',
            'origRow_doNotUse' and whichever of the morphological columns are listed in `morphos`.
        morphos: list of morphological column names available for this language.

    Returns:
        a new DataFrame (the per-exercise frames concatenated in sorted exerciseID order) with the
        added columns userTrial, userErr_RF, userErrVar_RF, nthOccurance, tokenLag1/2, stemLag1/2,
        morphoLag1/2, lagTr1Tr2, prevFormat, format:prevFormat, format:client, sentLength and
        timePerToken.
    """
    # `ud` and several temporaries are slices of bigger frames; silence the chained-assignment
    # warning for the duration of this function only. option_context restores the previous
    # setting on exit, even if an exception is raised.
    with pd.option_context('mode.chained_assignment', None):
        nrows = ud.shape[0]

        ### do stuff that affects the whole user (e.g. mean accuracies, etc. )
        ud['userTrial'] = np.arange(1,nrows+1) # what instance number is this for this user

        # user "random effect": mean error per format/session/client, from labelled rows only.
        # Only the 'error' column is averaged -- averaging the whole frame relies on pandas
        # silently dropping the non-numeric columns, which newer pandas versions refuse to do.
        groupUser = ['format','session','client']
        temp = ud.loc[(ud['source']=='train') | (ud['source']=='dev')].groupby(groupUser)['error']\
            .mean().reset_index()\
            .rename(index=str, columns = {'error': 'userErr_RF'})
        temp['userErrVar_RF'] = temp['userErr_RF'] * (1-temp['userErr_RF'])
        ud = ud.merge(temp, on = groupUser, how ='left')

        # Spacing stuff
        ud['nthOccurance'] = ud.groupby(['token']).cumcount() + 1 # repetitions per token (per user)
        # token spacing
        ud['tokenLag1'] = ud.groupby('token')['days'].diff() # gap between this occurrence and the one before it (1-back)
        ud['tokenLag2'] = ud.groupby('token')['days'].diff(2) - ud['tokenLag1'] # gap between the previous occurrence and the one before that
        # stem spacing
        ud['stemLag1'] = ud.groupby('stem')['days'].diff() # gap between this occurrence and the one before it (1-back)
        ud['stemLag2'] = ud.groupby('stem')['days'].diff(2) - ud['stemLag1'] # gap between the previous occurrence and the one before that
        # morpho spacing: group on whichever of these features exist for the current language
        morphoGroup = ['Number','Person','Tense','VerbForm']
        morphoLagGroup = list(set(morphoGroup).intersection(morphos))
        ud['morphoLag1'] = ud.groupby(morphoLagGroup)['days'].diff() # gap between this occurrence and the one before it (1-back)
        ud['morphoLag2'] = ud.groupby(morphoLagGroup)['days'].diff(2) - ud['morphoLag1']

        # lag btwn 1st and 2nd occurrence of each stem:
        # not all stems occur twice, so restrict to those that do before indexing position 1
        tokenCounts = ud.groupby('stem').size().reset_index(name='counts')
        multiTokens = tokenCounts.loc[tokenCounts['counts']>1] # get subset of stems that occur more than once
        temp = ud.loc[(ud['stem'].isin(multiTokens['stem']))]
        temp['lagTr1Tr2'] = temp.groupby('stem')['days'].transform(lambda x: x.iloc[1]) -\
            temp.groupby('stem')['days'].transform(lambda x: x.iloc[0])
        # lagTr1Tr2 is constant within a stem, so reduce the lookup to one row per stem BEFORE
        # merging. Merging the un-reduced frame is a many-to-many join (k*k rows for a stem seen
        # k times) that then has to be undone with drop_duplicates on the full frame; the rows
        # produced are the same either way because every row of `ud` is distinct (instanceID).
        ud = ud.merge(temp[['stem','lagTr1Tr2']].drop_duplicates(), on = 'stem', how='left')
        temp=None

        #Previous exercise: format of the exercise that came right before this one (original row order)
        temp = ud.groupby(['exerciseID','format']).first().reset_index().sort_values(by=['origRow_doNotUse'])
        temp['prevFormat'] = temp.format.shift(1)
        ud = ud.merge(temp[['exerciseID','prevFormat']], on = 'exerciseID', how = 'left')
        temp = None

        # encode categorical feature interactions -- to be one-hot-encoded later
        ud['format:prevFormat'] = ud['format'] + ':' + ud['prevFormat']
        ud['format:client'] = ud['format'] + ':' + ud['client']

        # do per-exercise stuff. A single groupby pass hands out the exercises in sorted
        # exerciseID order (the same order np.unique would give) without re-scanning the whole
        # frame once per exercise.
        to_return = [prepExerDat(exerciseRows) for _, exerciseRows in ud.groupby('exerciseID')]

        return(pd.concat(to_return))

def getLangName(l2):
    """
    Translate a two-letter language code ('sp', 'fr' or 'en') into the lower-case language name.

    Raises:
        KeyError: for any other code.
    """
    names = dict(sp='spanish', fr='french', en='english')
    return names[l2]
#Check for interlingual homographs
def check_in_words(x):
    """
    Return 1 if `x` appears in the nltk `words` corpus word list, else 0.

    The word list is read once, on the first call, and kept on the function as a frozenset.
    Re-reading the corpus and scanning it as a list on every call makes per-token use
    impractically slow.
    """
    vocab = getattr(check_in_words, '_vocab', None)
    if vocab is None:
        vocab = check_in_words._vocab = frozenset(words.words())
    return 1 if x in vocab else 0
    
def interaction(df,cols):
    """
    Add one product column per group of numeric columns.

    Parameters:
        df: DataFrame to extend; it is modified in place.
        cols: iterable of lists of column names; each list yields a new column named by joining
            the names with ':' and holding the row-wise product (NaN if any factor is NaN).

    Returns:
        the same DataFrame object.
    """
    for group in cols:
        product = df[group].prod(axis=1, skipna=False)
        df[":".join(group)] = product
    return df
                                       
def catInteraction(df,cols):
    """
    Add categorical-by-continuous interaction columns.

    Parameters:
        df: input DataFrame (not modified; a joined copy is returned).
        cols: iterable of [categorical column, continuous column] pairs.

    Returns:
        a DataFrame with, for every pair and every level of the categorical column, an extra
        column named '<continuous>:{<categorical>}<level>' that holds the continuous value on
        rows of that level and 0 elsewhere.
    """
    for pair in cols:
        cat_col, num_col = pair[0], pair[1]
        indicators = pd.get_dummies(df[cat_col], prefix = num_col, prefix_sep = ':{' + cat_col + '}')
        scaled = indicators.multiply(df[num_col], axis="index")
        df = df.join(scaled)
    return df

# A function that takes a full language's df as input and adds the various columns we'll want
def prepData(td, l2): # call the arg 'td' for compatibility with existing code
    """
    Run the whole feature-engineering pipeline for one language.

    Steps, in order: stem the tokens, recode the client, expand the 'morpho' dicts into columns,
    compute per-token mean error, add the per-user / per-exercise features (prepUserDat), merge
    the two external per-token CSV files, log-transform, impute missing numeric values with the
    column mean, add interaction columns, and one-hot encode the categorical columns.

    Parameters:
        td: DataFrame for one language, as built by makeDF plus a 'source' column. It is
            modified in place in the first steps, so pass a copy to keep the original.
        l2: language code understood by getLangName ('sp', 'fr' or 'en').

    Returns:
        the processed DataFrame. Row order follows the per-user processing, not the input order;
        the 'origRow_doNotUse' column carries the original 1-based row position.

    Raises:
        ValueError: if the language name has no morphological feature list defined here.
    """
    lang = getLangName(l2)
    stemmer = SnowballStemmer(lang) #Set stemmer language
    if lang == 'french':
        morphos = ['Definite','Gender','Mood','Number','Person','PronType','Tense','VerbForm'] #french
    elif lang == 'spanish': 
        morphos = ['Case','Definite','Degree','Foreign','NumType','Gender','Mood','Number','Person','Polite','Poss','PrepCase','PronType','Reflex','Tense','VerbForm'] #spanish
    elif lang == 'english':
        morphos = ['Case','Definite','Degree','Gender','Mood','Number','NumType'] #english
    else:
        # Fail here rather than printing and hitting a NameError on `morphos` further down.
        raise ValueError("Error in building morphological list: unsupported language " + repr(lang))

    print('Processing users learning ' + lang + '...')
    print('   Adding dataframe-level features ...', end ='')
    td['stem'] = [stemmer.stem(x) for x in td.token] # Add column for word stem
    # do stuff that affects the whole df (e.g. split morpho col)
    # recode client (b/c we don't care about android vs. ios diff)
    replaceClient = {'client': {'ios': 'mobile',
                           'android':'mobile'}}
    td.replace(replaceClient, inplace = True)
    
    td['origRow_doNotUse'] = np.arange(1, td.shape[0]+1) # to make sure no shuffling is going on
    # replace the single 'morpho' column of dicts by one column per morphological feature
    td = pd.concat([td.drop(['morpho'], axis=1),td['morpho'].apply(pd.Series)], axis=1)
    td['token'] = td['token'].str.lower()
    #Get word length for each token
    td['wordLength'] = [len(x) for x in td.token]
    lexGroupToken = ['format','token', 'stem', 'pos']
    #Calculate 'random effects' across tokens: mean error per format/token/stem/pos, from labelled
    #rows only. Only 'error' is averaged -- averaging the whole frame relies on pandas silently
    #dropping the non-numeric columns, which newer pandas versions refuse to do.
    temp = td.loc[(td['source']=='train') | (td['source']=='dev')].groupby(lexGroupToken)['error']\
        .mean().reset_index()\
        .rename(index=str, columns = {'error': 'tokenErr_RF'})
    temp['tokenErrVar_RF'] = temp['tokenErr_RF'] * (1-temp['tokenErr_RF'])
    td = td.merge(temp, on = lexGroupToken, how='left')
    temp = None
    print('Complete. \nNow processing user- & exercise-level data...')
    # do per-user stuff: 
    users = np.unique(td['user']) # sorted unique users (original order is NOT preserved)
    to_return = []
    for i in range(len(users)):
    #split up users and process them one at a time
        to_return.append(
            prepUserDat(td.loc[td.user == users[i]], morphos)) # process user's dat & add it to a list
    print('Merging users\' data & gathering orthographic info... ')
    td = pd.concat(to_return)
    orthoinfo = pd.read_csv('../Data/'+lang+'.csv', encoding='ISO-8859-1')
    td = td.merge(orthoinfo, on='token', how='left')
    
    print('Calculating morphological complexity')
    # share of this language's morphological features that are filled in for the token
    temp = td[morphos]
    td['morphoComplexity'] = (len(temp.columns) - temp.isnull().sum(axis=1))/len(temp.columns)
    temp = None

    temp = pd.read_csv('../Data/'+lang+'Dict.csv', encoding='utf-16')
    temp['token'] = temp['token_lower']
    td = td.merge(temp.drop(['token_lower'], axis=1), on='token', how='left')
    temp = None
    
    print('Transforming, imputing missing values, and z-scoring numeric columns...', end = '')
    # Log transform where appropriate
    sec = 1/60/60/24 # one second in units of days -- many zeros in lags so add 1 sec to each to take log
    td['tokenLag1'] = np.log10(td['tokenLag1'] + sec)
    td['tokenLag2'] = np.log10(td['tokenLag2'] + sec)
    td['stemLag1'] = np.log10(td['stemLag1'] + sec)
    td['stemLag2'] = np.log10(td['stemLag2'] + sec)
    td['logWordFreq'] = np.log10(td['WordFreq'])
    # NOTE(review): the '?' in the next three column names looks like a mis-encoded character
    # from the merged CSV headers -- confirm the real column names against the data files.
    td['log?'] = np.log10(td['?'])
    td['log?Pho'] = np.log10(td['?Pho'])
    td['log?EngPho'] = np.log10(td['?EngPho']) # is it still called that in the English data?

    #replace NaNs in continuous columns with the column mean (z-scoring is currently disabled)
    continuousCols = list(td.select_dtypes(include=[np.number]))
    continuousCols.remove('error')
    continuousCols.remove('Homograph')
    continuousCols.remove('origRow_doNotUse')
    # fillna with the NaN-skipping column mean; this replaces Series.replace(np.NaN, ..., axis=0),
    # whose `axis` argument is deprecated (see the warning in this cell's output).
    td[continuousCols] = td[continuousCols] \
       .apply(lambda x: x.fillna(x.mean()))
    
    # create interaction columns for continuous features
    interCols = [
        ['stemLag1','stemLag2'],
    ]
    td = interaction(td,interCols)
    
    # now for categorical * continuous
    catInterCols = [
        ['format', 'PhonNei'],
        ['format', 'OrthoNei'],
        ['pos', 'morphoComplexity']
    ]
    td = catInteraction(td,catInterCols)
    
    # now one-hot encode the categoricals & remove the originals from the df
    catCols = ['countries', 'client','session', 'format','pos', 'format:prevFormat', 'format:client']+ morphos
    for col in catCols:
        oneHotCols = pd.get_dummies(td[col], prefix = '{'+col+'}', prefix_sep='')
        td = td.drop([col],axis=1).join(oneHotCols)
                                       
    print('Complete.')
    print('All done!')
    return(td)

In [87]:
# Process every loaded language. The processed frame is stored next to the raw one under the
# key '<l2>_df', sorted back into the original row order.
for l2 in langs:
    data['{}_df'.format(l2)] = (
        prepData(data[l2].copy(), l2)  # send a copy so that original data is preserved
        .sort_values(by='origRow_doNotUse')
    )

Processing users learning spanish...
   Adding dataframe-level features ...Complete. 
Now processing user- & exercise-level data...
Merging users' data & gathering orthographic info... 
Calculating morphological complexity
Transforming, imputing missing values, and z-scoring numeric columns...

  warn('the "axis" argument is deprecated and will be removed in'


Complete.
All done!


In [30]:
# Pickle the whole `data` dict (raw and processed frames of every loaded language) to one file.
# NOTE(review): `l2` is not set in this cell -- it is whatever value the last `for l2 in langs`
# loop left behind, so the file is named after a single language although it contains all of
# them. Confirm this naming is what the downstream notebooks expect.
pickle_path = '../Data/DuoData_processed_' + l2 + '.pickle'
with open(pickle_path, 'wb') as handle:
    pickle.dump(data, handle)