In [None]:
import os

import pandas as pd

import DialoGPT.config as config

In [None]:
# List the entries found in the project's data directory (config.DATA_PATH).
os.listdir(config.DATA_PATH)

['.ipynb_checkpoints', 'RickAndMortyScripts.csv']

In [None]:
# Read the raw script CSV from the data directory into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer=config.DATA_PATH / 'RickAndMortyScripts.csv',
)

In [None]:
# Preview the first 10 rows of the loaded table.
df.head(10)

Unnamed: 0,index,season no.,episode no.,episode name,name,line
0,0,1,1,Pilot,Rick,Morty! You gotta come on. Jus'... you gotta co...
1,1,1,1,Pilot,Morty,"What, Rick? What’s going on?"
2,2,1,1,Pilot,Rick,"I got a surprise for you, Morty."
3,3,1,1,Pilot,Morty,It's the middle of the night. What are you tal...
4,4,1,1,Pilot,Rick,"Come on, I got a surprise for you. Come on, h..."
5,5,1,1,Pilot,Morty,Ow! Ow! You're tugging me too hard!
6,6,1,1,Pilot,Rick,"We gotta go, gotta get outta here, come on. Go..."
7,7,1,1,Pilot,Rick,"What do you think of this... flying vehicle, M..."
8,8,1,1,Pilot,Morty,"Yeah, Rick... I-it's great. Is this the surprise?"
9,9,1,1,Pilot,Rick,Morty. I had to... I had to do it. I had— I ha...


In [None]:
# (rows, columns) of the loaded table.
df.shape

(1905, 6)

Since DialoGPT is very similar to GPT-2, which is autoregressive, we need to predict the next token given a sequence.
We need to create the context for every present line by considering the n lines that occur before it.

I'll be using a context window of 9, i.e. one present sentence and the 9 previous sentences in the conversation.

In [None]:
# Number of preceding lines kept as context for each "present" line.
n = 9

In [None]:
# Build one window per line that has a full set of n predecessors.
# Each window holds n + 1 lines ordered newest-first:
#   [present line, previous line, ..., n-th previous line]
# Lines are pulled into a plain list first so that lookups are positional
# (independent of df's index labels) and the end of the data is handled by
# the range bounds rather than by catching KeyError.
lines = df["line"].tolist()
all_context = [
    lines[end - n:end + 1][::-1]  # slice is oldest-first; reverse so the present line leads
    for end in range(n, len(lines))  # empty when there are not more than n lines
]

In [None]:
#export
# Show the first two context windows to sanity-check their contents and ordering.
# NOTE(review): the flag above presumably marks this cell for export, yet the cell
# only displays a value -- confirm that exporting it is intended.
all_context[:2]

[['Morty. I had to... I had to do it. I had— I had to— I had to make a bomb, Morty. I had to create a bomb.',
  "Yeah, Rick... I-it's great. Is this the surprise?",
  'What do you think of this... flying vehicle, Morty? I built it outta stuff I found in the garage.',
  'We gotta go, gotta get outta here, come on. Got a surprise for you Morty.',
  "Ow! Ow! You're tugging me too hard!",
  'Come on, I got a surprise for you.  Come on, hurry up.',
  "It's the middle of the night. What are you talking about?",
  'I got a surprise for you, Morty.',
  'What, Rick? What’s going on?',
  "Morty! You gotta come on. Jus'... you gotta come with me."],
 ['What?! A bomb?!',
  'Morty. I had to... I had to do it. I had— I had to— I had to make a bomb, Morty. I had to create a bomb.',
  "Yeah, Rick... I-it's great. Is this the surprise?",
  'What do you think of this... flying vehicle, Morty? I built it outta stuff I found in the garage.',
  'We gotta go, gotta get outta here, come on. Got a surprise fo

Create the column names for the new DataFrame.

In [None]:
# Column labels: the current line first, then one "context_<k>" label per
# preceding line (k = 0 .. n - 1).
columns = ["present", *(f"context_{k}" for k in range(n))]

In [None]:
# Turn the list of windows into a table: one row per window, one column per position.
cleaned_df = pd.DataFrame(data=all_context, columns=columns)

Preview the new DataFrame, then save it so we don't have to create it again.

In [None]:
# Preview the first rows of the context table.
cleaned_df.head()

Unnamed: 0,present,context_0,context_1,context_2,context_3,context_4,context_5,context_6,context_7,context_8
0,Morty. I had to... I had to do it. I had— I ha...,"Yeah, Rick... I-it's great. Is this the surprise?","What do you think of this... flying vehicle, M...","We gotta go, gotta get outta here, come on. Go...",Ow! Ow! You're tugging me too hard!,"Come on, I got a surprise for you. Come on, h...",It's the middle of the night. What are you tal...,"I got a surprise for you, Morty.","What, Rick? What’s going on?",Morty! You gotta come on. Jus'... you gotta co...
1,What?! A bomb?!,Morty. I had to... I had to do it. I had— I ha...,"Yeah, Rick... I-it's great. Is this the surprise?","What do you think of this... flying vehicle, M...","We gotta go, gotta get outta here, come on. Go...",Ow! Ow! You're tugging me too hard!,"Come on, I got a surprise for you. Come on, h...",It's the middle of the night. What are you tal...,"I got a surprise for you, Morty.","What, Rick? What’s going on?"
2,We're gonna drop it down there just get a whol...,What?! A bomb?!,Morty. I had to... I had to do it. I had— I ha...,"Yeah, Rick... I-it's great. Is this the surprise?","What do you think of this... flying vehicle, M...","We gotta go, gotta get outta here, come on. Go...",Ow! Ow! You're tugging me too hard!,"Come on, I got a surprise for you. Come on, h...",It's the middle of the night. What are you tal...,"I got a surprise for you, Morty."
3,T-t-that's absolutely crazy!,We're gonna drop it down there just get a whol...,What?! A bomb?!,Morty. I had to... I had to do it. I had— I ha...,"Yeah, Rick... I-it's great. Is this the surprise?","What do you think of this... flying vehicle, M...","We gotta go, gotta get outta here, come on. Go...",Ow! Ow! You're tugging me too hard!,"Come on, I got a surprise for you. Come on, h...",It's the middle of the night. What are you tal...
4,"Come on, Morty. Just take it easy, Morty. It's...",T-t-that's absolutely crazy!,We're gonna drop it down there just get a whol...,What?! A bomb?!,Morty. I had to... I had to do it. I had— I ha...,"Yeah, Rick... I-it's great. Is this the surprise?","What do you think of this... flying vehicle, M...","We gotta go, gotta get outta here, come on. Go...",Ow! Ow! You're tugging me too hard!,"Come on, I got a surprise for you. Come on, h..."


In [None]:
# Expect len(df) - n rows (the first n lines lack a full window) and n + 1 columns.
cleaned_df.shape

(1896, 10)

In [None]:
# Write the context table to the data directory, leaving out the row index.
cleaned_df.to_csv(
    path_or_buf=config.DATA_PATH / 'cleaned_df_with_contexts.csv',
    index=False,
)