# Practice with Python
#### _Basic analysis on Harry Potter's characters_
A notebook for practicing running code in JupyterLab or Jupyter Notebook; you can also run it in Visual Studio Code with the Python extension.

## Import libraries and dataset

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Characters from the Harry Potter movies.
# Source: Kaggle "Harry Potter Movies Dataset" by Maryna Antonevych,
# read here from a raw CSV hosted on GitHub.
my_url = 'https://raw.githubusercontent.com/vcuspinera/Datasets/main/harry_potter_movies/Characters.csv'
df = pd.read_csv(
    filepath_or_buffer=my_url,
    encoding='latin1',  # the file is decoded as Latin-1
)

# Preview the first rows of the table
df.head()

Unnamed: 0,Character ID,Character Name,Species,Gender,House,Patronus,Wand (Wood),Wand (Core)
0,1,Harry Potter,Human,Male,Gryffindor,Stag,Holly,Phoenix Feather
1,2,Ron Weasley,Human,Male,Gryffindor,Jack Russell Terrier,,
2,3,Hermione Granger,Human,Female,Gryffindor,Otter,Vine,Dragon Heartstring
3,4,Albus Dumbledore,Human,Male,Gryffindor,Phoenix,Elder,Thestral Tail Hair
4,5,Rubeus Hagrid,Half-Human/Half-Giant,Male,Gryffindor,,Oak,


## Basic stats of some variables

In [3]:
# List the column labels of the characters table
df.columns

Index(['Character ID', 'Character Name', 'Species', 'Gender', 'House',
       'Patronus', 'Wand (Wood)', 'Wand (Core)'],
      dtype='object')

In [4]:
# Summary of the table: number of rows, non-null count and dtype per column, memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 166 entries, 0 to 165
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Character ID    166 non-null    int64 
 1   Character Name  166 non-null    object
 2   Species         125 non-null    object
 3   Gender          125 non-null    object
 4   House           75 non-null     object
 5   Patronus        20 non-null     object
 6   Wand (Wood)     21 non-null     object
 7   Wand (Core)     18 non-null     object
dtypes: int64(1), object(7)
memory usage: 10.5+ KB


In [5]:
# Count the missing (NaN) values in each column
df.isna().sum()

Character ID        0
Character Name      0
Species            41
Gender             41
House              91
Patronus          146
Wand (Wood)       145
Wand (Core)       148
dtype: int64

In [6]:
# Characters per Species, most frequent first.
# The counts stay in a column named 'Character Name' (non-null names per group).
df_species = (
    df.groupby('Species')['Character Name']
    .count()
    .sort_values(ascending=False)
    .to_frame()
)
df_species

Unnamed: 0_level_0,Character Name
Species,Unnamed: 1_level_1
Human,104
Ghost,4
Centaur,3
Goblin,2
Half-Human/Half-Giant,2
House Elf,2
Werewolf,2
Acromantula,1
Basilisk,1
Giant,1


In [7]:
# Characters per House, most frequent first.
# The counts stay in a column named 'Character Name' (non-null names per group).
df_house = (
    df.groupby('House')['Character Name']
    .count()
    .sort_values(ascending=False)
    .to_frame()
)
df_house

Unnamed: 0_level_0,Character Name
House,Unnamed: 1_level_1
Gryffindor,31
Slytherin,20
Ravenclaw,12
Hufflepuff,8
Beauxbatons Academy of Magic,2
Durmstrang Institute,2


In [8]:
# Characters per Gender, most frequent first.
# NOTE(review): the result includes one row whose Gender is 'Human'; this looks
# like a data-entry issue in the source data -- confirm before relying on it.
(
    df.groupby('Gender')['Character Name']
    .count()
    .sort_values(ascending=False)
    .to_frame()
)

Unnamed: 0_level_0,Character Name
Gender,Unnamed: 1_level_1
Male,82
Female,42
Human,1


## Plots

In [9]:
# Altair must be installed separately; see the installation guide:
#    https://altair-viz.github.io/getting_started/installation.html
# If this import raises ModuleNotFoundError, Altair is not installed in the
# environment the kernel is running in.
import altair as alt
# Select Altair's 'default' renderer for displaying charts
alt.renderers.enable('default')


ModuleNotFoundError: No module named 'altair'

In [10]:
# Horizontal bar chart: number of characters per House
house_freq = df_house.reset_index().rename(columns={"Character Name": "Frequency"})

(
    alt.Chart(house_freq)
    .mark_bar()
    .encode(
        x='Frequency',
        y=alt.Y('House:N', sort='-x'),          # order bars by descending x value
        color=alt.Color('House', legend=None),  # one colour per house, legend hidden
        tooltip=['House', 'Frequency'],
    )
    .properties(title="Number of characters of Harry Potter's movies by House")
)

In [11]:
# Horizontal bar chart: number of characters per Species (log-scaled x axis)
species_freq = df_species.reset_index().rename(columns={"Character Name": "Frequency"})

(
    alt.Chart(species_freq)
    .mark_bar()
    .encode(
        x=alt.X('Frequency', scale=alt.Scale(type='log')),
        y=alt.Y('Species:N', sort='-x'),          # order bars by descending x value
        color=alt.Color('Species', legend=None),  # one colour per species, legend hidden
        tooltip=['Species', 'Frequency'],
    )
    .properties(title="Number of characters of Harry Potter's movies by Species")
)