In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
# The sub-directory list is unused; it is named `_subdirs` rather than `_`
# so the loop does not rebind IPython's "last output" variable.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Importing libraries

In [3]:
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud

Read data

In [4]:
# All five tables are read from the same folder; keeping the folder in one
# constant means a relocated dataset needs a single edit instead of five.
DATA_DIR = "../input/harry-potter-movies-dataset/Harry_Potter_Movies"


def load_table(file_name, data_dir=DATA_DIR):
    """Read one CSV table of the dataset into a DataFrame.

    Parameters
    ----------
    file_name : str
        File name inside ``data_dir``, e.g. ``"Movies.csv"``.
    data_dir : str, optional
        Folder that holds the CSV files. Defaults to ``DATA_DIR``.

    Returns
    -------
    pandas.DataFrame
        The parsed table, decoded as latin1.
    """
    return pd.read_csv(f"{data_dir}/{file_name}", encoding='latin1')


movies = load_table("Movies.csv")
characters = load_table("Characters.csv")
dialogues = load_table("Dialogue.csv")
spells = load_table("Spells.csv")
places = load_table("Places.csv")

**Movies**

In [5]:
# Bare expression: the notebook renders the movies DataFrame as this cell's output.
movies

Removing **','** and **'$'** from **Budget** and **Box Office**, then converting the strings to integers

In [6]:
def currency_to_int(amounts):
    """Strip '$' and ',' from every value of a column and cast it to int64.

    Parameters
    ----------
    amounts : pandas.Series
        Column of amounts, either still as text containing '$' / ','
        separators or already converted to integers.

    Returns
    -------
    pandas.Series
        The same values with dtype int64.

    Raises
    ------
    ValueError
        If a value is not a valid integer once '$' and ',' are removed
        (this includes missing values).
    """
    # astype(str) first: it makes the cell safe to re-run, because the `.str`
    # accessor would raise AttributeError on a column that is already int64.
    # regex=True is spelled out with a character class: a bare '$' is the regex
    # end-of-string anchor, and the default of `regex` differs between pandas
    # versions, so relying on the default is fragile.
    digits = amounts.astype(str).str.replace(r'[$,]', '', regex=True)
    return digits.astype('int64')


for moneyColumn in ('Budget', 'Box Office'):
    movies[moneyColumn] = currency_to_int(movies[moneyColumn])

Adding column **Profit**

In [7]:
# Profit = Box Office minus Budget, computed row by row.
movies['Profit'] = movies['Box Office'].sub(movies['Budget'])
# Show the table again so the new column is visible.
movies

Plotting **Runtime**, **Budget**, **Box Office** and **Profit**

In [41]:
# One horizontal bar chart per metric, stacked vertically in a single figure.
movieMetrics = ['Runtime', 'Budget', 'Box Office', 'Profit']
fig, axes = plt.subplots(nrows=len(movieMetrics), ncols=1, figsize=(10,15))
plt.suptitle("Movies visualizations", fontsize=15)
# Each metric is drawn on its own row of the grid, with the movie titles on the y axis.
for metric, panel in zip(movieMetrics, axes):
    sns.barplot(data=movies, x=metric, y='Movie Title', ax=panel, palette="Pastel2")
plt.show()

**Characters**

In [9]:
# Preview the characters table; head() shows the first five rows by default.
characters.head()

In [10]:
# (number of rows, number of columns) of the characters table.
characters.shape

Plotting the **Species** of the characters

In [42]:
# Count each species once and reuse the counts for both the palette size and the pie.
speciesCounts = characters['Species'].value_counts()
# One Pastel2 colour per species, sampled evenly across the colormap.
speciesColor = plt.cm.Pastel2(np.linspace(0, 1, speciesCounts.size))
plt.figure(figsize=(6,6))
plt.title("Species chart")
ax = speciesCounts.plot.pie(colors=speciesColor)

Plotting the **Gender** of the characters

In [43]:
# Count each gender once and reuse the counts for both the palette size and the pie.
genderCounts = characters['Gender'].value_counts()
# One Pastel2 colour per gender value, sampled evenly across the colormap.
genderColor = plt.cm.Pastel2(np.linspace(0, 1, genderCounts.size))
plt.figure(figsize=(6,6))
plt.title("Gender chart")
ax = genderCounts.plot.pie(colors=genderColor)

Plotting the **Houses** of the characters (including Beauxbatons and Durmstrang)

In [13]:
# Hand-picked hex colours for the pie slices, passed to the plot in this order.
houseColor = ["#ff9191", "#bbff91", "#91c1ff", "#ffeb91", "#ba91ff", "#ffb291"]
# Number of characters per house (value_counts leaves out missing houses).
houseCounts = characters['House'].value_counts()
plt.figure(figsize=(6,6))
plt.title("House chart")
ax = houseCounts.plot.pie(colors=houseColor)

**Dialogues**

In [26]:
# Preview the dialogue table; head() shows the first five rows by default.
dialogues.head()

In [27]:
# (number of rows, number of columns) of the dialogue table.
dialogues.shape

Merging **Characters** and **Dialogues** based on **Character ID**

In [32]:
# Attach each line of dialogue to the details of the character who speaks it.
# An inner join keeps only rows whose 'Character ID' appears in both tables.
characterDialogues = pd.merge(dialogues, characters, how='inner', on='Character ID')
characterDialogues

Number of dialogue lines per character

In [48]:
# Number of merged dialogue rows per character, most frequent first.
# NOTE: this uses the original 'Character Name' column. The next cell renames it
# to 'CharacterName', so re-running this cell after the rename raises KeyError.
characterDialogues['Character Name'].value_counts()

In [50]:
# Drop the space from the column name so later cells can use attribute access
# (characterDialogues.CharacterName).
characterDialogues = characterDialogues.rename(columns={'Character Name': 'CharacterName'})

Dialogue distribution of the 20 characters with the most lines

In [55]:
plt.figure(figsize=(10,8))
sns.set_style('whitegrid')
# The 20 characters with the most rows, in descending order; this fixes both
# which bars are drawn and the order they appear in.
topSpeakers = characterDialogues['CharacterName'].value_counts().head(20).index
sns.countplot(data=characterDialogues, y='CharacterName', order=topSpeakers, palette="Pastel1")
plt.title('Dialogue Distribution')
plt.xlabel('Number of lines of dialogue')
plt.ylabel('Character')
plt.show()

Entire dialogue wordcloud

In [57]:
# Concatenate every line of dialogue into one string. Missing entries are
# dropped and the rest coerced to str, because str.join raises TypeError on a
# non-string element such as NaN.
text = " ".join(characterDialogues['Dialogue'].dropna().astype(str))
# random_state seeds the word placement so the figure is the same on every run.
wordCloud = WordCloud(
    width=1000,
    height=1000,
    background_color="white",
    min_font_size=15,
    random_state=42,
).generate(text)
plt.figure(figsize = (10,10))
plt.imshow(wordCloud)
plt.axis("off")
plt.show()

Harry Potter's dialogue wordcloud

In [59]:
# Keep only the rows whose CharacterName is exactly 'Harry Potter'.
isHarry = characterDialogues['CharacterName'].eq('Harry Potter')
harry = characterDialogues.loc[isHarry]

In [60]:
# Concatenate the dialogue of the `harry` subset into one string. Missing
# entries are dropped and the rest coerced to str, because str.join raises
# TypeError on a non-string element such as NaN.
harryText = " ".join(harry['Dialogue'].dropna().astype(str))
# random_state seeds the word placement so the figure is the same on every run.
wordCloud = WordCloud(
    width=1000,
    height=1000,
    background_color="white",
    min_font_size=15,
    random_state=42,
).generate(harryText)
plt.figure(figsize = (10,10))
plt.imshow(wordCloud)
plt.axis("off")
plt.show()

**Spells**

In [61]:
# Preview the spells table; head() shows the first five rows by default.
spells.head()

In [62]:
# (number of rows, number of columns) of the spells table.
spells.shape

Plotting the **Light** emitted by each spell

In [63]:
# Count each Light value once and reuse the counts for both the palette size and the pie.
lightCounts = spells['Light'].value_counts()
# One Pastel2 colour per Light value, sampled evenly across the colormap.
spellsColor = plt.cm.Pastel2(np.linspace(0, 1, lightCounts.size))
plt.figure(figsize=(6,6))
plt.title("Lights chart")
ax = lightCounts.plot.pie(colors=spellsColor)

**Places**

In [64]:
# Preview the places table; head() shows the first five rows by default.
places.head()

In [65]:
# (number of rows, number of columns) of the places table.
places.shape

Plotting the distribution of places by **Place Category**

In [66]:
# Count each place category once and reuse the counts for both the palette size and the pie.
categoryCounts = places['Place Category'].value_counts()
# One Pastel2 colour per category, sampled evenly across the colormap.
placesColor = plt.cm.Pastel2(np.linspace(0, 1, categoryCounts.size))
plt.figure(figsize=(6,6))
plt.title("Places chart")
ax = categoryCounts.plot.pie(colors=placesColor)