# Preprocessing Human chat 

In [85]:
import pandas as pd
import numpy as np

# Reading the File

In [86]:
with open('human_chat.txt', 'r') as file:
    lines = file.readlines()

In [87]:
len(lines)

1495

# Make data

In [88]:
df = pd.DataFrame(lines,columns = ['text'])

In [89]:
df.head()

Unnamed: 0,text
0,Human 1: Hi!\n
1,Human 2: What is your favorite holiday?\n
2,Human 1: one where I get to meet lots of diffe...
3,Human 2: What was the most number of people yo...
4,Human 1: Hard to keep a count. Maybe 25.\n


In [90]:
df['p1'] = df['text']
df['p2'] = df['text'].shift(-1)

In [91]:
df.shape

(1495, 3)

In [92]:
# who to whom
df['said'] = df.p1.apply(lambda x: (x.split(':')[0]).split(' ')[-1])
df['to'] = df.p2.apply(lambda x: (x.split(':')[0]).split(' ')[-1] if type(x)==str else x)

In [93]:
df.head(5)

Unnamed: 0,text,p1,p2,said,to
0,Human 1: Hi!\n,Human 1: Hi!\n,Human 2: What is your favorite holiday?\n,1,2
1,Human 2: What is your favorite holiday?\n,Human 2: What is your favorite holiday?\n,Human 1: one where I get to meet lots of diffe...,2,1
2,Human 1: one where I get to meet lots of diffe...,Human 1: one where I get to meet lots of diffe...,Human 2: What was the most number of people yo...,1,2
3,Human 2: What was the most number of people yo...,Human 2: What was the most number of people yo...,Human 1: Hard to keep a count. Maybe 25.\n,2,1
4,Human 1: Hard to keep a count. Maybe 25.\n,Human 1: Hard to keep a count. Maybe 25.\n,Human 2: Which holiday was that?\n,1,2


# fixing a dtype

In [94]:
df.said.unique(), df.to.unique()

(array(['1', '2', '\n'], dtype=object),
 array(['2', '1', '\n', None], dtype=object))

There is also a null value in the last row due to value shift. (fill it with 3)

In [95]:
print(df.to.isnull().sum())
print(df[df['to'].isnull()].index.tolist())


1
[1494]


In [96]:
# replacing \n to 3
df[df['said']=='\n'].index, df[df['to']=='\n'].index

(Int64Index([919], dtype='int64'), Int64Index([918], dtype='int64'))

In [97]:
df.at[919, 'said'] = 9
df.at[918, 'to'] = 9
df.at[1494, 'to'] = 9


In [98]:
df['to'] = pd.to_numeric(df['to'])
df['said'] = pd.to_numeric(df['said'])

In [99]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1495 entries, 0 to 1494
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    1495 non-null   object
 1   p1      1495 non-null   object
 2   p2      1494 non-null   object
 3   said    1495 non-null   int64 
 4   to      1495 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 58.5+ KB


In [100]:
df.said.unique(), df.to.unique()

(array([1, 2, 9]), array([2, 1, 9]))

======================

In [101]:
df['status'] = df['said'] - df['to']

In [102]:
df.head()

Unnamed: 0,text,p1,p2,said,to,status
0,Human 1: Hi!\n,Human 1: Hi!\n,Human 2: What is your favorite holiday?\n,1,2,-1
1,Human 2: What is your favorite holiday?\n,Human 2: What is your favorite holiday?\n,Human 1: one where I get to meet lots of diffe...,2,1,1
2,Human 1: one where I get to meet lots of diffe...,Human 1: one where I get to meet lots of diffe...,Human 2: What was the most number of people yo...,1,2,-1
3,Human 2: What was the most number of people yo...,Human 2: What was the most number of people yo...,Human 1: Hard to keep a count. Maybe 25.\n,2,1,1
4,Human 1: Hard to keep a count. Maybe 25.\n,Human 1: Hard to keep a count. Maybe 25.\n,Human 2: Which holiday was that?\n,1,2,-1


In [103]:
df['status'].unique()

array([-1,  1,  0, -8,  8])

In [104]:
df['status'].value_counts()

-1    730
 1    730
 0     32
-8      2
 8      1
Name: status, dtype: int64

Here we will only select conversation where speaker 1 is talking to speaker 2, Because this is basically one way conversation (like you and your crush)

In [105]:
df.head(62)

Unnamed: 0,text,p1,p2,said,to,status
0,Human 1: Hi!\n,Human 1: Hi!\n,Human 2: What is your favorite holiday?\n,1,2,-1
1,Human 2: What is your favorite holiday?\n,Human 2: What is your favorite holiday?\n,Human 1: one where I get to meet lots of diffe...,2,1,1
2,Human 1: one where I get to meet lots of diffe...,Human 1: one where I get to meet lots of diffe...,Human 2: What was the most number of people yo...,1,2,-1
3,Human 2: What was the most number of people yo...,Human 2: What was the most number of people yo...,Human 1: Hard to keep a count. Maybe 25.\n,2,1,1
4,Human 1: Hard to keep a count. Maybe 25.\n,Human 1: Hard to keep a count. Maybe 25.\n,Human 2: Which holiday was that?\n,1,2,-1
...,...,...,...,...,...,...
57,Human 2: it is pretty dry in the mountains I b...,Human 2: it is pretty dry in the mountains I b...,Human 1: hmm.... okay. Climate change talk thi...,2,1,1
58,Human 1: hmm.... okay. Climate change talk thi...,Human 1: hmm.... okay. Climate change talk thi...,"Human 2: nope, what does it say?\n",1,2,-1
59,"Human 2: nope, what does it say?\n","Human 2: nope, what does it say?\n",Human 1: they were talking about how AI is hel...,2,1,1
60,Human 1: they were talking about how AI is hel...,Human 1: they were talking about how AI is hel...,Human 1: Hi.\n,1,1,0


In [124]:
data = df[df.status==-1]

In [125]:
data.shape

(730, 6)

In [126]:
data.head()

Unnamed: 0,text,p1,p2,said,to,status
0,Human 1: Hi!\n,Human 1: Hi!\n,Human 2: What is your favorite holiday?\n,1,2,-1
2,Human 1: one where I get to meet lots of diffe...,Human 1: one where I get to meet lots of diffe...,Human 2: What was the most number of people yo...,1,2,-1
4,Human 1: Hard to keep a count. Maybe 25.\n,Human 1: Hard to keep a count. Maybe 25.\n,Human 2: Which holiday was that?\n,1,2,-1
6,Human 1: I think it was Australia\n,Human 1: I think it was Australia\n,Human 2: Do you still talk to the people you m...,1,2,-1
8,Human 1: Not really. The interactions are usua...,Human 1: Not really. The interactions are usua...,"Human 2: Yea, me too. I feel like God often pu...",1,2,-1


In [127]:
data = data[['p1', 'p2']]

In [128]:
data

Unnamed: 0,p1,p2
0,Human 1: Hi!\n,Human 2: What is your favorite holiday?\n
2,Human 1: one where I get to meet lots of diffe...,Human 2: What was the most number of people yo...
4,Human 1: Hard to keep a count. Maybe 25.\n,Human 2: Which holiday was that?\n
6,Human 1: I think it was Australia\n,Human 2: Do you still talk to the people you m...
8,Human 1: Not really. The interactions are usua...,"Human 2: Yea, me too. I feel like God often pu..."
...,...,...
1484,Human 1: I'm looking through some old photos o...,Human 2: I feel you. Memeories always flood wh...
1486,"Human 1: It's called <REDACTED_TERM>, on the n...",Human 2: Hope you can take some time to visit ...
1488,"Human 1: Yes, that would be nice. Do you like ...",Human 2: Yes I do. But haven’t been to many pl...
1490,"Human 1: Where would you most like to go, if y...",Human 2: Fly to the moon :) Haha\n


In [129]:
aa = 'what : is the : name : gooog.'
print(' '.join(aa.split(':')[1:]))

 is the   name   gooog.


In [130]:
data['p1'] = data['p1'].apply(lambda x : ' '.join(x.split(':')[1:]))
data['p2'] = data['p2'].apply(lambda x : ' '.join(x.split(':')[1:]))

In [131]:
data

Unnamed: 0,p1,p2
0,Hi!\n,What is your favorite holiday?\n
2,one where I get to meet lots of different peo...,What was the most number of people you have e...
4,Hard to keep a count. Maybe 25.\n,Which holiday was that?\n
6,I think it was Australia\n,Do you still talk to the people you met?\n
8,Not really. The interactions are usually shor...,"Yea, me too. I feel like God often puts stran..."
...,...,...
1484,I'm looking through some old photos of my hom...,I feel you. Memeories always flood when looki...
1486,"It's called <REDACTED_TERM>, on the north coa...",Hope you can take some time to visit pretty s...
1488,"Yes, that would be nice. Do you like travelli...",Yes I do. But haven’t been to many places yet\n
1490,"Where would you most like to go, if you could?\n",Fly to the moon ) Haha\n


In [132]:
data['p1'] = data['p1'].apply(lambda x : x.replace("\n", ""))
data['p2'] = data['p2'].apply(lambda x : x.replace("\n", ""))

In [133]:
data.head()

Unnamed: 0,p1,p2
0,Hi!,What is your favorite holiday?
2,one where I get to meet lots of different peo...,What was the most number of people you have e...
4,Hard to keep a count. Maybe 25.,Which holiday was that?
6,I think it was Australia,Do you still talk to the people you met?
8,Not really. The interactions are usually shor...,"Yea, me too. I feel like God often puts stran..."


In [134]:
data.columns = ['col1', 'col2']

In [136]:
data.to_csv('human_dataset.csv')