In [None]:
import pandas 
import numpy as np 
import gc
import random
from collections import OrderedDict

# Shortest sequence that is kept; the count includes the final row, which is used as the output/label.
MINIMUM_ROWS_PER_SEQUENCE = 6 # this includes the output
# Tolerance in seconds (5 minutes) used when checking that consecutive rows' timestamps are consistent.
MAXIMUM_TIMESTAMP_ERROR = 60*5 # number of seconds of error
# Length that shorter sequences are padded to (with -1) so that inputs of different sizes can be handled.
SEQUENCE_WITH_PADDING_LEN = 20 # we handle multiple sizes

In [None]:
# Mount Google Drive inside the Colab VM so the archive stored there can be read.
from google.colab import drive
drive.mount('/content/drive')
# Copy the archive from Drive to /content/ and unpack it
# (it contains three .7z files: difficulty, forgetting_curve and raw).
!cp /content/drive/MyDrive/duolingo/dataverse_files.zip /content/
!unzip /content/dataverse_files.zip

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Archive:  /content/dataverse_files.zip
  inflating: opensource_dataset_difficulty.tsv.7z  
  inflating: opensource_dataset_forgetting_curve.tsv.7z  
  inflating: opensource_dataset_raw.tsv.7z  


In [None]:
# Extract the word-difficulty table from its 7z archive (`e` = extract into the current directory).
!7z e opensource_dataset_difficulty.tsv.7z


7-Zip [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,2 CPUs Intel(R) Xeon(R) CPU @ 2.20GHz (406F0),ASM,AES-NI)

Scanning the drive for archives:
  0M Scan         1 file, 126353 bytes (124 KiB)

Extracting archive: opensource_dataset_difficulty.tsv.7z
--
Path = opensource_dataset_difficulty.tsv.7z
Type = 7z
Physical Size = 126353
Headers Size = 170
Method = LZMA2:19
Solid = -
Blocks = 1

  0%    Everything is Ok

Size:       478930
Compressed: 126353


In [None]:
# Load the per-word difficulty table (tab-separated).
# `dataset` is deliberately NOT bound here: it is loaded from its own file in a
# later cell, and aliasing it to the difficulty table would let downstream cells
# run silently on the wrong data if that load were skipped or failed.
difficulty = pandas.read_table("/content/opensource_dataset_difficulty.tsv")

In [None]:
# Display the difficulty table; its `w` and `d` columns are used below to build the word -> difficulty map.
difficulty

In [None]:
# Load the first 1,000,000 rows of the main table and preview them.
# NOTE(review): no visible cell extracts `opensource_dataset.tsv` (only the
# difficulty archive is unpacked above) — confirm where this file comes from.
dataset = pandas.read_table("/content/opensource_dataset.tsv", nrows=1000000)
dataset.head()

Unnamed: 0,u,w,i,t_history,r_history,delta_t,r
0,2c5a94,claim,3,1,0,1,1
1,2c5a94,affiliate,3,1,0,1,1
2,2c5a94,likely,2,0,0,1,1
3,2c5a94,dessert,3,1,0,1,1
4,2c5a94,invaluable,3,1,0,1,1


In [None]:
# Word -> difficulty lookup, built directly from the `w` and `d` columns.
# Equivalent to filling the dict row by row (for a repeated word the last
# occurrence wins), but without the per-row overhead of DataFrame.iterrows().
diff_map = dict(zip(difficulty['w'], difficulty['d']))

In [None]:
# Create column `d` as a copy of the word column; the following cell turns it into the word's difficulty via `diff_map`.
dataset['d'] = dataset['w']

In [None]:
# Attach each row's word difficulty. The lookup is keyed on the word column `w`
# rather than on `d` itself, so re-running this cell gives the same result
# (mapping `d` a second time would look difficulties up in a word-keyed dict
# and turn the whole column into NaN). Words missing from `diff_map` become NaN.
dataset['d'] = dataset['w'].map(diff_map)

In [None]:
# The dataset is highly unbalanced.
# To fix this we are going to do some data augmentation later;
# for now, let's just remove some columns.
# NOTE(review): the `dataset.head()` output above lists the columns
# u, w, i, t_history, r_history, delta_t, r — none of the three names deleted
# here appears in it, so on that schema these statements would raise KeyError.
# This cell looks written for a different input table; confirm before running.

del dataset["lexeme_string"]
del dataset["learning_language"]
del dataset["ui_language"]

In [None]:
# Sort so that all rows of one (user_id, lexeme_id) pair are contiguous and ordered by timestamp;
# the grouping itself is done by a later cell that walks the sorted rows (the groupby call is disabled).
# NOTE(review): `user_id`, `lexeme_id` and `timestamp` are not among the columns shown by
# `dataset.head()` above (u, w, i, t_history, ...) — confirm which input table this cell targets.
dataset = dataset.sort_values(['user_id','lexeme_id', 'timestamp']) #.groupby(['user_id', 'lexeme_id']) 


1819

In [None]:
# Preview the first rows ordered by user, word and interval history (display only; `dataset` itself is not reordered).
dataset.sort_values(['u', 'w', 't_history']).head()

Unnamed: 0,u,w,i,t_history,r_history,delta_t,r,d
491677,0001d9,abrupt,16,0111111136134717,000100111011110,1,1,8.0
491653,0001d9,abundant,21,01313135211313513513,01010110001011011011,4,1,5.0
491648,0001d9,accommodate,23,0111111111113131311111,0000000000010101000000,1,1,7.0
491670,0001d9,accommodations,25,011242413131131111131111,001101010100100000100001,3,0,3.0
491673,0001d9,ambitious,6,01121,00101,2,1,4.0


In [None]:
# Keep only the rows whose `t_history` holds at least six comma-separated
# entries; each kept element is the row Series yielded by iterrows().
data = [
  record
  for _, record in dataset.iterrows()
  if len(record['t_history'].split(',')) >= 6
]

In [None]:
X = []
Y = []
diff = []  # never filled in this cell; kept so the name stays defined

# Value fed as the "previous answer" of the very first step of a history, where
# no earlier answer exists. 0.5 matches the neutral value the later
# augmentation cells place at the start of every sequence.
NO_PREVIOUS_RECALL = 0.5

for line in data:
  # Full history = the stored comma-separated history plus the current review.
  r_history = line['r_history'].split(',')
  r_history.append(line['r'])
  t_history = line['t_history'].split(',')
  t_history.append(line['delta_t'])
  # One row per review: column 0 = answer, column 1 = interval. Cast up front so
  # every value handled below is numeric instead of a string.
  row = np.column_stack((r_history, t_history)).astype('float32')

  # Left-pad with (-1, -1) so that short histories reach the fixed length.
  seq = [[-1, -1] for _ in range(max(0, SEQUENCE_WITH_PADDING_LEN - len(row)))]

  # Only the last SEQUENCE_WITH_PADDING_LEN reviews are kept.
  first = max(0, len(row) - SEQUENCE_WITH_PADDING_LEN)

  # Each step pairs the *previous* answer with the *current* interval, so the
  # answer being predicted never appears in its own input. The previous answer
  # is re-initialised for every sample: it is the review just before the window
  # when the history was truncated, otherwise the neutral value. (Previously it
  # was read before assignment on the first sample and leaked the preceding
  # sample's label into every following one.)
  previous = row[first - 1][0] if first > 0 else NO_PREVIOUS_RECALL
  for answer, interval in row[first:]:
    seq.append([previous, interval])
    previous = answer

  X.append(np.array(seq, dtype='float32'))
  # After the loop `previous` is the answer of the last review, i.e. the label.
  Y.append(previous)

X = np.array(X, dtype='float32')
Y = np.array(Y, dtype='float32')

In [None]:
from sklearn.utils import shuffle
X, Y = shuffle(X, Y, random_state=0)

np.save('processed_data.npy', X)
np.save('processed_labels.npy', Y)

! gzip /content/processed_data.npy
! gzip /content/processed_labels.npy

In [None]:
# Walk the (already sorted) rows and collect consecutive rows that share the
# same (user_id, lexeme_id) pair into one list per pair.
done = (-1, -1)  # key of the group currently being filled
arr_dataset = []
for index, row in dataset.iterrows():
    if (row["user_id"], row["lexeme_id"]) != done:
        # A new group starts: discard the previous one if it ended up too short.
        if len(arr_dataset) > 0 and len(arr_dataset[-1]) < MINIMUM_ROWS_PER_SEQUENCE:
            del arr_dataset[-1]

        # TO REMOVE
        # Temporary cap on the number of groups collected.
        if len(arr_dataset) > 60000:
            break

        arr_dataset.append([])
        done = (row["user_id"], row["lexeme_id"])
    arr_dataset[-1].append(row.tolist())

# The length check above only runs when the *next* group begins, so when the
# loop reaches the end of the data the final group was never checked.
if arr_dataset and len(arr_dataset[-1]) < MINIMUM_ROWS_PER_SEQUENCE:
    del arr_dataset[-1]

# The DataFrame is no longer needed; release its memory.
del dataset
gc.collect()

0

In [None]:
# full sequences
# One training sequence per group: a start step, optional -1 padding, then one
# step per row pairing the previous recall value with that row's columns 2 and 7.
augmented_data = []
labels = []

for events in arr_dataset:
  first = events[0]
  # Value carried into the first real step: column 6 divided by column 5 of the first row.
  previous_recall = first[6] / first[5]
  steps = [[0.5, 0, first[5]]]

  # Pad right after the start step until start + padding + rows reaches the
  # target length (no padding when the group is already long enough).
  missing = SEQUENCE_WITH_PADDING_LEN - len(events) - len(steps)
  steps.extend([-1, -1, -1] for _ in range(max(0, missing)))

  for event in events:
    steps.append([previous_recall, event[2] / 86400, event[7]])
    previous_recall = event[0]

  augmented_data.append(np.array(steps, dtype='float32'))
  # The label is column 0 of the group's last row.
  labels.append(events[-1][0])

In [None]:
augmented_data = []
labels = []

for row in arr_dataset:
  start = 0
  
  for end in range(MINIMUM_ROWS_PER_SEQUENCE, len(row)+1):
    seq = []
    label = []
    old_prob = row[start][6] / row[start][5]
    seq.append([0.5, 0, row[start][5]])
    label.append(old_prob)

    for i in range(start, end):

      seq.append([old_prob, row[i][2] / 86400, row[i][7]])
      old_prob = row[i][0]
      label.append(old_prob)
    
    augmented_data.append(np.array(seq, dtype='float32'))
    labels.append(np.array(label, dtype='float32'))
    start += 1

# balancing dataset ---
augmented_data = np.array(augmented_data, dtype='float32')
labels = np.array(labels, dtype='float32')

from sklearn.utils import shuffle
augmented_data, labels = shuffle(augmented_data, labels, random_state=0)

ones_x = augmented_data[np.where(labels[:,6] == 1)][:len(augmented_data[np.where(labels[:,6] != 1)])]
ones_y = labels[np.where(labels[:,6] == 1)][:len(augmented_data[np.where(labels[:,6] != 1)])]

augmented_data = np.delete(augmented_data, np.where(labels[:,6] == 1), axis=0)
labels = np.delete(labels, np.where(labels[:,6] == 1), axis=0)

augmented_data = np.concatenate((augmented_data, ones_x), axis=0)
labels = np.concatenate((labels, ones_y), axis=0)

augmented_data, labels = shuffle(augmented_data, labels, random_state=0)
# ---


np.save('processed_data.npy', augmented_data)
np.save('processed_labels.npy', labels)

! gzip /content/processed_data.npy
! gzip /content/processed_labels.npy

In [None]:
# Debug output: print the first 100 groups.
# NOTE(review): the groups in `arr_dataset` can differ in length; recent NumPy
# versions refuse to build such a ragged array without dtype=object — confirm
# this cell still runs on the NumPy version in use.
print(np.array(arr_dataset[:100]))
# TO REMOVE, for now we handle just 0/1

# The string literal below is disabled code (it is evaluated as a bare string
# and has no effect): it would binarise column 0 of every row to 1 / 0.
'''
for i in range(len(arr_dataset)):
    for j in range(len(arr_dataset[i])):
        arr_dataset[i][j][0] = (1 if arr_dataset[i][j][0] == 1 else 0)
'''
#count = sum([(1 if row[5][0] == 0.0 else 0) for row in arr_dataset])
#print(count)

In [None]:
# we now generate more data using the sliding window technique
# For every group, each prefix end point yields one window of at most
# SEQUENCE_WITH_PADDING_LEN rows; a window is cut at the first row whose
# timestamp (column 1) minus its delta (column 2) disagrees with the previous
# row's timestamp by more than MAXIMUM_TIMESTAMP_ERROR.
augmented_data = []
dataset = []
for index, row in enumerate(arr_dataset):
    # Progress report every 10000 groups.
    if index % 10000 == 0:
        print(str(index), "/", str(len(arr_dataset)))
    for end_index in range(MINIMUM_ROWS_PER_SEQUENCE, len(row)+1):
        start_index = max(0, end_index - SEQUENCE_WITH_PADDING_LEN)
        window = []
        for position in range(start_index, end_index):
            if position > start_index:
                expected_previous = row[position][1] - row[position][2]
                if abs(expected_previous - row[position-1][1]) > MAXIMUM_TIMESTAMP_ERROR:
                    break
            window.append(row[position])
        # Windows that were cut too short are dropped.
        if len(window) < MINIMUM_ROWS_PER_SEQUENCE:
            continue
        # Pad at the end with rows of -1 up to the fixed length.
        width = len(window[0])
        window.extend([-1] * width for _ in range(SEQUENCE_WITH_PADDING_LEN - len(window)))
        dataset.append(window)

random.shuffle(dataset)
del augmented_data
gc.collect()

0 / 30001
10000 / 30001
20000 / 30001
30000 / 30001


0

In [None]:
# Histogram of sequence lengths: length -> number of sequences.
dd = dict()
for sequence in dataset:
  dd[len(sequence)] = dd.get(len(sequence), 0) + 1


# Report every length from 6 to 99. Lengths that never occur print 0 instead
# of raising KeyError.
for i in range(6, 100):
    print(str(i), ": ", dd.get(i, 0))

In [None]:
def _last_real_label(sequence):
    """Return column 0 of the last non-padding row of `sequence`.

    Padding rows are filled with -1 and are appended *after* the real rows, so
    `sequence[-1]` is a padding row for every sequence shorter than the padded
    length. Returns -1 when the sequence contains only padding.
    """
    for step in reversed(sequence):
        if step[0] != -1:
            return step[0]
    return -1


# Label of every sequence, read from its last real row rather than from a padding row.
last_labels = [_last_real_label(row) for row in dataset]

# Number of sequences whose label is not exactly 1; at most this many
# label-1 sequences are kept so that they cannot outnumber the rest.
total_0 = sum(1 for label in last_labels if label != 1)
total_1 = 0
temporary = []

for row, label in zip(dataset, last_labels):
    if label != 1:
        temporary.append(row)
    elif total_1 < total_0:
        temporary.append(row)
        total_1 += 1

# we now use numpy instead of lists
# lists were useful for fast appending, but now we can continue with numpy

# Shuffle the list that is actually kept (not the one about to be discarded).
random.shuffle(temporary)
dataset = np.array(temporary)
del temporary
gc.collect()

In [None]:
# Sanity check of the class balance: count sequences whose label is exactly 0 or exactly 1.
count_0 = 0
count_1 = 0
for sequence in dataset:
    # The label is column 0 of the last non-padding row (padding rows hold -1
    # and sit at the end). Values are compared numerically so the check works
    # whether the array stores numbers or their string form.
    label = next(
        (float(step[0]) for step in reversed(sequence) if float(step[0]) != -1),
        None,
    )
    if label == 0.0:
        count_0 += 1
    elif label == 1.0:
        count_1 += 1

print(len(dataset), "\n")
print(count_0, "\n")
print(count_1, "\n")

In [None]:
# NOTE(review): `to_export` is not defined in any visible cell, so this raises
# NameError on a fresh kernel; the cell below writes `dataset_to_save` to the
# same file name — confirm whether this cell is still needed.
np.save('processed.npy', to_export)

In [None]:

# Some columns are no longer needed (timestamp, user, word): remove the
# entries at positions 1, 3 and 4 along axis 2 of every row.
dataset_to_save = np.delete(dataset, [1,3,4], axis=2) 
# Shuffle the sequences in place (first axis only; unseeded), then write them to disk.
np.random.shuffle(dataset_to_save)
np.save('processed.npy', dataset_to_save)

In [None]:
# Shuffles the in-memory array again. NOTE(review): this runs after the array
# was already saved by the previous cell, so the file on disk is unaffected.
np.random.shuffle(dataset_to_save)

In [None]:
! gzip /content/processed.npy

In [None]:
#newtest = np.load('processed.npy')