# pandas

In [None]:
import pandas as pd
import numpy as np

## Create a pandas series / dataframe

Create a series / dataframe from scratch

In [None]:
# Build a three-element float series from a plain Python list
my_series = pd.Series([1.0, 2.0, 3.0])

In [None]:
# Display the series created above (a bare expression ending a cell is shown as its output)
my_series

In [None]:
# A series may contain missing values; np.nan marks the gap
pd.Series([1, 2, np.nan])

In [None]:
# Create a data frame from a nested list: each inner list becomes one row
h = [[1, 2], [3, 4]]
df_h = pd.DataFrame(h)
# Print the label on its own line so the frame's column header stays aligned
print('Data Frame:', df_h, sep='\n')

Import data into a data frame from a .csv file

In [None]:
# Read data into a data frame from a .csv file

# First, create a simple csv

# animal,unique_id,water_need
# elephant, 1001, 500
# elephant, 1002, 600
# tiger, 1003,300
# tiger, 1004, 350
# tiger, 1005, 330
# kangaroo, 1006, 410
# monkey, 1007, 220

# Write the header and all seven records to zoo.csv in a single call
with open('zoo.csv', 'w') as outfile:
    outfile.writelines([
        'animal,unique_id,water_need\n',
        'elephant, 1001, 500\n',
        'elephant, 1002, 600\n',
        'tiger, 1003,300\n',
        'tiger, 1004, 350\n',
        'tiger, 1005, 330\n',
        'kangaroo, 1006, 410\n',
        'monkey, 1007, 220\n',
    ])

In [None]:
# Load zoo.csv into a data frame, with a comma as the field separator
df_zoo = pd.read_csv('zoo.csv', sep=',')

In [None]:
# Display the data frame that was just loaded from zoo.csv
df_zoo

In [None]:
# In practice, you will rarely write a .csv data file yourself, as we just did.
# You will usually work with pre-existing data files instead.

# I've uploaded a small dataset here: https://raw.githubusercontent.com/kirakowalska/hello-world/master/pokemon.csv

In [None]:
# Download it to your Azure space
!curl https://raw.githubusercontent.com/kirakowalska/hello-world/master/pokemon.csv -o pokemon.csv

In [None]:
# Now read the file as before: pd.read_csv parses pokemon.csv into the data frame `df`,
# which every cell below works on
df = pd.read_csv('pokemon.csv')

## Inspect your data

In [None]:
# Display the data frame
df

In [None]:
# Show the first five records (n=5 is also the default)
df.head(n=5)

In [None]:
# Show the last five records (n=5 is also the default)
df.tail(n=5)

In [None]:
# Get summary statistics (count, mean, std, min, quartiles, max);
# by default describe() only summarises the numeric columns
df.describe()

In [None]:
# Draw five records at random (a different draw on every run, as no random_state is set)
df.sample(n=5)

## Slice your data

In [None]:
# Select specific columns by name, keeping every row
df.loc[:, ['Name', 'Speed']]

In [None]:
# Take rows by position: 0, 1 and 2 (the end of the slice is excluded)
df.iloc[0:3]

In [None]:
# Combine row and column slicing
## Using column names
# NOTE: .loc slices by label and includes BOTH endpoints, so 0:3 returns four
# rows (labels 0, 1, 2 and 3), unlike positional slicing, which excludes the end.
df.loc[0:3,['Name','Speed']]

In [None]:
## Using column indices
# Positional slicing excludes the end: rows 0-2 and columns 1-4
df.iloc[:3, 1:5]

In [None]:
# Keep only the rows whose Name equals 'Ivysaur' (boolean-mask filtering)
df.loc[df['Name'] == 'Ivysaur']

## Fix your data ;)

In [None]:
# Drop rows that repeat an earlier 'Name'; the first occurrence is kept
df_no_duplicates = df.drop_duplicates(subset='Name')

In [None]:
# Order the rows by 'Speed', ascending (the default); df itself is left unchanged
df_sorted = df.sort_values(by='Speed')

In [None]:
# Rename the 'Type 1' column to 'type1'; the result is a new frame
df_renamed = df.rename({'Type 1': 'type1'}, axis='columns')

In [None]:
# See if there are any missing values:
# isna() flags every missing cell and sum() then counts the flags per column
df.isna().sum()

In [None]:
# Discard every row that has at least one missing value
df_no_nan = df.dropna(axis='index', how='any')

In [None]:
# Replace every missing entry, in every column, with the constant 5
df_filled = df.fillna(5)

## Basic plotting

In [None]:
# Import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Plot a bar chart with counts of Type 1
# Series.value_counts() replaces the top-level pd.value_counts(), which is
# deprecated since pandas 2.1; the counts come back sorted in descending order.
df['Type 1'].value_counts().plot.bar()

In [None]:
# Scatter plot with Attack on the x-axis and Defense on the y-axis
df.plot(kind="scatter", x="Attack", y="Defense")

In [None]:
# Histogram of the Attack values, split into 20 bins
df[['Attack']].plot(kind='hist', bins=20)