In [1]:
import pandas as pd
import matplotlib.pyplot as plt
# Default size (width, height in inches) for every figure drawn in this notebook.
plt.rc("figure", figsize=(20, 10))

In [None]:
# Load the transcript CSV and drop the 'Unnamed: 0' and 'id' columns,
# then preview the first rows.
df = (
    pd.read_csv('../data/avatar.csv', encoding='unicode_escape')
    .drop(columns=['Unnamed: 0', 'id'])
)
df.head()

In [3]:
# Number of rows per value of the 'character' column, most frequent first.
speakers = (
    df.groupby('character')
    .size()
    .sort_values(ascending=False)
)

In [None]:
# Bar chart of the 20 most frequent speakers (`speakers` is sorted in descending order).
top_speakers = speakers.iloc[:20]
plt.bar(top_speakers.index, top_speakers.values)
plt.title('20 most common speakers')
plt.xlabel('Speaker name')
plt.ylabel('Number of quotes')
plt.xticks(rotation=-45)  # tilt the names so they do not overlap
plt.show()

# Preprocessing
The plot above shows that there is a character named "Scene Description". Scene descriptions are useless for the sake of text (speech) generation, so we check what share of the whole dataset they account for.

In [None]:
# How much of the dataset is scene description rather than character speech?
# A single boolean mask marks the scene-description rows; every other row
# is counted as a character statement.
is_description = df['character'] == 'Scene Description'
total = len(df)
descriptions = int(is_description.sum())
characters = total - descriptions
print(f"{'Total number of expressions:':<30}{total:<10}")
print(f"{'Characters statements:':<30}{characters:<6}( {characters/total * 100:.2f}% )")
print(f"{'Scene descriptions:':<30}{descriptions:<6}( {descriptions/total * 100:.2f}% )")


There are also some troublesome characters in the file:

| occurrences   | character         |
|---------------|-------------------|
| 1766          | Aang              |
| 2             | Aang and Sokka    |
| 1             | Aang and Zuko     |
| 1             | Aang:             |
| 1             | Actor Bumi        |
| 5             | Actor Iroh        |
| 2             | Actor Jet         |
| 5             | Actor Ozai        |
| 16            | Actor Sokka       |
| 3             | Actor Toph        |
| 14            | Actor Zuko        |
| 19            | Actress Aang      |
| 10            | Actress Azula     |
| 16            | Actress Katara    |

Hence, we perform some preprocessing, consecutively executing the following steps:
1. Drop scene descriptions.
2. Drop statements spoken by more than one character - there is no simple way to assign them to the proper character. To avoid manual labeling we drop them, since there are only a few of them.
3. Convert all character names to lower case.
4. Remove tokens like ":", "actor", "actress" from character names.
5. Transform names to title case.


In [6]:
# Read the CSV again so that preprocessing starts from the rows as stored on disk,
# without the 'Unnamed: 0' and 'id' columns.
df = pd.read_csv(
    '../data/avatar.csv',
    encoding='unicode_escape',
).drop(['Unnamed: 0', 'id'], axis=1)

In [7]:
# Normalise the speaker labels so that every remaining row belongs to one character.

# Step 1: scene descriptions are not speech, so they are dropped.
df = df[df['character'] != 'Scene Description']

# Step 2: drop rows attributed to several speakers (e.g. "Aang and Sokka").
# "and" is matched as a whole word only; a plain substring match would also
# throw away single speakers whose name merely contains those letters
# (e.g. "Commander").
df = df[~df['character'].str.contains(r'\band\b', regex=True)]

# Steps 3-5: lower-case, remove ":" and the standalone words "actor"/"actress",
# trim the whitespace left behind by the removal (otherwise "Actor Zuko" would
# become " Zuko" and stay separate from "Zuko"), then title-case the result.
character = (
    df['character']
    .str.lower()
    .str.replace(r':|\b(?:actor|actress)\b', '', regex=True)
    .str.strip()
    .str.title()
)

# assign() builds a new frame instead of writing into a filtered slice, which
# avoids SettingWithCopyWarning; drop=True discards the old row labels.
df = df.assign(character=character).reset_index(drop=True)

In [None]:
# Display the frame to inspect the result of the preprocessing cell above.
df

In [None]:
# Count how often each word occurs across all values of 'character_words'.
# The result has two columns: 'index' (the word) and 0 (its number of occurrences).
temp_df = (
    df["character_words"]
    .str.lower()
    # Remove punctuation. regex=True is required: since pandas 2.0 str.replace
    # treats the pattern as a literal string by default, which would leave the
    # punctuation in place. The raw string avoids invalid-escape warnings.
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.split(expand=True)  # split on whitespace, one column per word position
    .stack()                 # flatten to a single Series of words
    .value_counts()
    # Name the count column 0 explicitly: pandas >= 2.0 would otherwise call it
    # 'count', while the following cell reads it as temp_df[0].
    .reset_index(name=0)
)

In [15]:
# Convert the values in column 0 into shares of their total.
# Series.sum() is vectorised; the builtin sum() would iterate over the
# Series element by element in Python.
temp_df[0] = temp_df[0] / temp_df[0].sum()

In [None]:
# Display the word table; column 0 was divided by its sum in the previous cell.
temp_df