In [87]:
import json
import nltk
import re
import pandas as pd
from tqdm import tqdm

pd.set_option('display.max_colwidth', -1)

In [88]:
df = pd.read_csv("data/train_data.csv")
df = df.drop(columns = ["Unnamed: 0", "path", "speakerId", "action", "object", "location"])
df

Unnamed: 0,transcription
0,Change language
1,Resume
2,Turn the lights on
3,Switch on the lights
4,Switch off the lights
...,...
23132,Where is my phone
23133,How is my oven
23134,When is the event
23135,Why is the oven on


In [89]:
df_test = pd.read_csv("data/test_data.csv")
df_test = df_test.drop(columns = ["Unnamed: 0", "path", "speakerId", "action", "object", "location"])
df_test

Unnamed: 0,transcription
0,Turn on the lights
1,Turn off the lights
2,Change language
3,Pause the music
4,Resume
...,...
3788,I need to practice my Chinese. Switch the language
3789,I need to practice my German. Switch the language
3790,I need to practice my Korean. Switch the language
3791,I need to practice my English. Switch the language


In [90]:
df_valid = pd.read_csv("data/valid_data.csv")
df_valid = df_valid.drop(columns = ["Unnamed: 0", "path", "speakerId", "action", "object", "location"])
df_valid

Unnamed: 0,transcription
0,Turn on the lights
1,Turn off the lights
2,Change language
3,Pause the music
4,Resume
...,...
3113,Lights on
3114,Switch off the lights
3115,Turn the lights off
3116,Lights off


### Split the words

In [91]:
# Split words
"""
questions_raw = [obj[str(conv)]['content'][0]['message'].split() for conv in obj]
"""
def split_words (df):
    questions_raw = []

    for conv in df.iterrows():
        utterance = conv[1]['transcription']
        questions_raw.append(utterance.split(" "))
    return questions_raw


In [92]:
questions_raw = split_words(df)
test_raw = split_words(df_test)
valid_raw = split_words(df_valid)

print(questions_raw[0])
print(test_raw[0])
print(valid_raw[0])

['Change', 'language']
['Turn', 'on', 'the', 'lights']
['Turn', 'on', 'the', 'lights']


### Remove punctuation

In [93]:

def remove_punctuation (split_questions):
    questions = []
    # Remove all punctuation
    for question in split_questions:
        q = []
        for word in question:
            s = re.sub(r'[^\w\s]','',word)
            q.append(s)

        questions.append(q)
    return questions


In [94]:
question = remove_punctuation(questions_raw)
question_test = remove_punctuation(test_raw)
question_valid = remove_punctuation(valid_raw)


print(question[0])
print(question_test[0])
print(question_valid[0])

['Change', 'language']
['Turn', 'on', 'the', 'lights']
['Turn', 'on', 'the', 'lights']


### Create a dictionary of nouns

In [95]:
def create_dictionary (questions_list):

    nouns = []
    is_noun = lambda pos: pos[:2] == 'NN'

    # Loop borrowed from https://stackoverflow.com/questions/33587667/extracting-all-nouns-from-a-text-file-using-nltk
    for questions in questions_list:
        my_nouns = []
        
        for i, question in enumerate(questions):

            for word in question:
                token = nltk.pos_tag(nltk.word_tokenize(str(word)))

                for word,wordtype in token:
                    if (wordtype == 'NNS' or wordtype == 'NN'):
                        my_nouns.append(word)
        nouns = nouns + my_nouns
    
    # Remove duplicates
    nouns = list(set(nouns))
    return nouns

#### Remove duplicates

In [96]:
nouns = create_dictionary([question, question_test, question_valid])

In [97]:
print(nouns)

['Turn', 'phones', 'phone', 'turn', 'music', 'switch', 'Increase', 'system', 'languages', 'please', 'kitchen', 'Bathroom', 'shoes', 'Pause', 'Far', 'sound', 'event', 'room', 'Decrease', 'Use', 'Too', 'Switch', 'couldnt', 'Thats', 'bedroom', 'lights', 'need', 'heat', 'mute', 'newspaper', 'Louder', 'Language', 'cooler', 'video', 'Change', 'Volume', 'hear', 'Lamp', 'Heat', 'audio', 'hotter', 'OK', 'softer', 'Bedroom', 'temperature', 'Start', 'Washroom', 'loud', 'juice', 'anything', 'socks', 'lamp', 'device', 'Fetch', 'cant', 'language', 'increase', 'practice', 'heating', 'quieter', 'bathroom', 'decrease', 'louder', 'volume', 'Quieter', 'levels', 'settings', 'Lights', 'Set', 'washroom', 'max']


## Create dataset

#### Populate dataset

In [98]:
def create_dataset (questions):

    my_df = []

    for i, question in enumerate(questions):
        w = question[-1:][0]

        if (i % 2) == 0:
            sentence = ' '.join(question[:-1])
            target = 'incomplete'

            if (question[-1:] == ["on"] or question[-1:] == ["off"] or question[-1:] == ["up"]
               or question[-1:] == ["down"] or question[-1:] == ["in"] or question[-1:] == ["out"]):
                sentence =  ' '.join(question[:-2])

        else:
            sentence = ' '.join(question)
            target = 'complete'

        my_df.append([sentence, target])
        
    return my_df


In [99]:
data_train = create_dataset(question)
data_test = create_dataset(question_test)
data_valid = create_dataset(question_valid)

In [100]:
def create_df (data):
    
    header =  ['sentence', 'target']
    return pd.DataFrame(data=data, columns = header)    
    
df_train = create_df(data_train)
df_test = create_df(data_test)
df_valid = create_df(data_valid)

df_test.head()

Unnamed: 0,sentence,target
0,Turn on the,incomplete
1,Turn off the lights,complete
2,Change,incomplete
3,Pause the music,complete
4,,incomplete


In [101]:
"""
# Remove duplicates
df.drop_duplicates(subset ="sentence", 
                     keep = 'first', inplace = True)

df.head(30)
"""

'\n# Remove duplicates\ndf.drop_duplicates(subset ="sentence", \n                     keep = \'first\', inplace = True)\n\ndf.head(30)\n'

In [102]:
def save_sentence_txt(name, df):
    myfile = open('EOT_DATA/{}.txt'.format(name), 'w')

    for index, row in df.iterrows():
        myfile.write("{} \n".format(row['sentence']))

    myfile.close()
    
save_sentence_txt("eot_sentences", df_train)
save_sentence_txt("eot_sentences_test", df_test)
save_sentence_txt("eot_sentences_valid", df_valid)

In [103]:
def save_target_txt(name, df):
    myfile = open('EOT_DATA/{}.txt'.format(name), 'w')
    
    for index, row in df.iterrows():
        myfile.write("%s\n" % row['target'])

    myfile.close()
    
save_target_txt("eot_target", df_train)
save_target_txt("eot_target_test", df_test)
save_target_txt("eot_sentences_valid", df_valid)