# Pandas exercises

### Introduction:

This exercise was inspired by this [page](http://chrisalbon.com/).

### Import the necessary libraries

In [None]:
import pandas as pd
import numpy as np

### This is the data given as a dictionary

In [None]:
# Raw data for an example dataset about a fictional army: one dictionary key per
# column, each holding 12 values (one per row). Missing values are np.nan.
# `np.nan` is used instead of the `np.NaN` alias, which NumPy 2.0 removed.
raw_data = {'regiment': ['Nighthawks', 'Nighthawks', 'Nighthawks', 'Nighthawks', 'Dragoons', 'Dragoons', 'Dragoons', 'Dragoons', 'Scouts', 'Scouts', 'Scouts', 'Scouts'],
            'company': ['1st', '1st', '2nd', '2nd', '1st', '1st', '2nd', '2nd','1st', '1st', '2nd', '2nd'],
            'deaths': [523, 52, 25, 616, np.nan, 234, 523, 62, 62, 73, 37, 35],
            'battles': [5, 42, 2, 2, 4, 7, 8, 3, 4, 7, 8, 9],
            'size': [1045, np.nan, 1099, 1400, 1592, np.nan, 987, np.nan, 973, 1005, 1099, 1523],
            'veterans': [1, 5, 62, 26, 73, 37, 949, 48, 48, 435, 63, 345],
            'readiness': [1, 2, 3, 3, 2, 1, 2, 3, 2, 1, 2, 3],
            'armored': [1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1],
            'deserters': [4, 24, 31, 2, 3, 4, 24, 31, 2, 3, 2, 3],
            # NOTE(review): 'Louisana' looks like a misspelling of 'Louisiana'; kept
            # unchanged because these strings are data values.
            'origin': ['Arizona', 'California', 'Texas', 'Florida', 'Maine', 'Iowa', 'Alaska', 'Washington', 'Oregon', 'Wyoming', 'Louisana', 'Georgia']}

### Create a dataframe and assign it to a variable called army. 

#### Don't forget to include the column names in the order presented in the dictionary ('regiment', 'company', 'deaths'...) so that the column order is consistent with the solutions. If omitted, older versions of pandas (before 0.23) will order the columns alphabetically; newer versions keep the dictionary's insertion order.

In [None]:
# Build the frame from the dictionary, spelling out the column order explicitly
# so it does not depend on how pandas orders dictionary keys.
army = pd.DataFrame(
    raw_data,
    columns=[
        'regiment', 'company', 'deaths', 'battles', 'size',
        'veterans', 'readiness', 'armored', 'deserters', 'origin',
    ],
)

### Set the 'origin' column as the index of the dataframe

In [None]:
# Promote 'origin' from a regular column to the row index. set_index returns a
# new frame, so the name is rebound; the bare expression then displays it.
army = army.set_index(keys='origin')
army

### See the first five entries in the dataframe

In [None]:
# Show the first five rows (five is also head()'s default).
army.head(n=5)

### What is the number of observations in the dataset?

In [None]:
# The number of observations is the number of rows in the frame.
len(army)

### What is the number of columns in the dataset?

In [None]:
print(army.shape) # (number of rows, number of columns) as a tuple
print(army.shape[1]) # second element of the tuple: the number of columns only

# OR

army.info() # prints a summary; its "Data columns (total 9 columns)" line shows the column count

### Print only the column veterans

In [None]:
# All rows of the single column 'veterans', returned as a Series.
army.loc[:, 'veterans']

### Print the columns 'veterans' and 'deaths'

In [None]:
# All rows of the two listed columns, returned as a DataFrame in the listed order.
army.loc[:, ['veterans', 'deaths']]

### Print the name of all the columns.

In [None]:
# `columns` is the Index of column labels; 'origin' is not listed because it was
# set as the row index earlier.
army.columns

### Select the 'deaths', 'size' and 'deserters' columns from Maine and Alaska

In [None]:
# Label-based selection with .loc: the rows labelled 'Maine' and 'Alaska',
# restricted to the 'deaths', 'size' and 'deserters' columns.
army.loc[['Maine', 'Alaska'], ['deaths', 'size', 'deserters']]

### Select the rows 3 to 7 and the columns 3 to 6

In [None]:
# .iloc is position-based, zero-indexed, and excludes the slice end:
# 3:7 -> rows at positions 3, 4, 5, 6; 3:6 -> columns at positions 3, 4, 5.
army.iloc[3:7, 3:6]

### Select rows where army.deaths is greater than 50

In [None]:
# Boolean-mask filter; rows whose 'deaths' value is missing compare as False
# and are therefore left out.
army[army['deaths'].gt(50)]

### Select rows where army.deaths is greater than 500 or less than 50

In [None]:
# Combine two element-wise comparisons with | (logical OR on boolean Series).
# Missing 'deaths' values fail both comparisons, so those rows are excluded.
army[army['deaths'].gt(500) | army['deaths'].lt(50)]

### Select all the regiments not named "Dragoons"

In [None]:
# Keep every row whose regiment is anything other than 'Dragoons'.
army[army['regiment'].ne('Dragoons')]

### Get summary statistics for each column

In [None]:
# describe() reports count, mean, std, min, quartiles and max per column; for a
# frame mixing numeric and text columns it summarises only the numeric ones by default.
army.describe()

### Compute how many missing values there are in each column

In [None]:
# isna() marks each missing cell as True; summing per column counts them.
army.isna().sum()

### Drop rows that are missing `deaths` information

In [None]:
# Drop the rows whose 'deaths' value is missing. dropna returns a new frame;
# the result is only displayed here, so `army` itself is left unchanged.
army.dropna(axis='index', subset=['deaths'])

### Fill in missing `size` entries with the mean size

In [None]:
# Replace missing 'size' values with the column mean (mean() skips NaN by default).
# The filled column is assigned back rather than calling fillna(..., inplace=True)
# on army['size']: that chained in-place call acts on an intermediate object,
# raises a FutureWarning in pandas 2.x and leaves `army` unchanged under Copy-on-Write.
army['size'] = army['size'].fillna(army['size'].mean())

In [None]:
# Display the frame to inspect the 'size' column after the fill.
army