# Pandas Introduction

In [None]:
import numpy as np
import pandas as pd

## Series object

In [None]:
series_obj = pd.Series([31,22,43,44,55])
series_obj

In [None]:
series_obj[0]

In [None]:
# Build the Series with explicit row labels and a name in a single call.
# The same result can be reached after construction with:
#    series_obj.index = ['a', 'b', 'c', 'd', 'e']
#    series_obj.name = "Column_1"
series_obj = pd.Series(
    [31, 22, 43, 44, 55],
    index=list("abcde"),
    name="Column_1",
)
series_obj

In [None]:
series_obj['a']   # like a dictionary

In [None]:
series_obj*2

In [None]:
over_35 = series_obj>35
over_35

In [None]:
series_obj[over_35]

## DataFrame object
### **Create a DataFrame using a dictionary**

In [None]:
# Build a DataFrame from a dictionary: each key becomes a column name and
# each list becomes that column's values.
data = dict(
    Name=["Tim Miller", "Ann Carter", "Ellen Lee"],
    Gender=["Male", "Female", "Female"],
    Age=[32, 44, 21],
)
df = pd.DataFrame(data)
# Leaving df as the last expression renders it as an HTML table;
# print(df) would show plain text instead.
df

In [None]:
df.head() # == df.head(5)

In [None]:
df.tail()  # == df.tail(5)

In [None]:
# Series object
df['Name']     # dictionary notation

In [None]:
df.Name     # attribute notation; Tab completion

In [None]:
# assignment by column
df["Birth Year"] = 1999
df["Birth Year"]

In [None]:
# add a column
df["Married"] = ['Yes', 'Yes', 'No']     # must match the length/index of the DataFrame
df

In [None]:
df["Married"] = 'Yes'
df

## Selection and Filtering
### Select Columns

In [None]:
# Build a fresh 10x10 DataFrame holding the integers 0..99,
# with the columns labelled 'a' through 'j'.
data = pd.DataFrame(
    np.arange(100).reshape(10, 10),
    columns=list("abcdefghij"),
)
data

In [None]:
data['a']

In [None]:
data[["a", "e", "j"]]    # providing a single value or a list selects columns

In [None]:
# boolean selection
data[data["j"] > 40]

### Select rows

In [None]:
data[:1]     # use slice syntax to select rows

In [None]:
data[5:9]

### Selection with loc and iloc
Allows you to select a subset of the rows and columns

In [None]:
data

In [None]:
# loc implies the name/label of the row and column
data.loc[1:5,"b"]

In [None]:

data.loc[:5, 'a':'e']     # consecutive (loc selection is inclusive)

In [None]:
data.loc[:5, ['c', 'f', 'i']]     # not consecutive

In [None]:
# iloc selects by integer position  (the end of an iloc slice is exclusive)
data.iloc[:5, 2:5]

In [None]:
data.iloc[4]  # gives you a row, assumes all of the columns

In [None]:
data.iloc[[5, 0, 3], [9, 5, 0]]    # returns them in the order listed

In [None]:
df.columns   # returns the names of the columns within the DataFrame

### Descriptive and summary statistics

In [None]:
import pandas as pd
from sklearn.datasets import load_iris

# Load the iris dataset that ships with scikit-learn.
iris = load_iris()

# A notebook cell only displays its last expression, so the intermediate
# values are printed explicitly to keep them visible.
print(type(iris))
print(iris.feature_names)

# Wrap the measurements in a DataFrame, using the feature names as column labels.
# The preview is left as the last expression so it renders as a table.
ddff = pd.DataFrame(iris.data, columns=iris.feature_names)
ddff.head()

In [None]:
# get external data and change header names
new_iris_data = pd.read_csv("test_iris.csv")
new_iris_data.head()

In [None]:
#####################
#####################

iris_data = pd.read_csv("iris.csv", header=None)
iris_data

In [None]:
# import the iris data
iris_data = pd.read_csv("iris.csv", names = ["sepal_l", "sepal_w", "petal_l", "petal_w", "class"])

In [None]:
iris_data.head()  # head() returns the first 5 rows by default; tail() returns the bottom 5

In [None]:
iris_data.describe()

In [None]:
iris_data.columns

In [None]:
iris_data["class"]

In [None]:
set(iris_data["class"])

In [None]:
iris_data["class"].describe()      # non-numerical data

In [None]:
# some Descriptive and Summary statistics  (min, max, idxmin, idxmax, mean, median, std, count, corr)
iris_data.min() 

In [None]:
iris_data["sepal_l"].head(20)

In [None]:
# .count() will return the number of items in an object
iris_data.loc[iris_data['sepal_l'] > 4.9, "sepal_l"].count()

In [None]:
# you can also use the built-in python function len() to return the length of an object
len(iris_data[iris_data['sepal_l'] > 4.9])

In [None]:
# use unique() to get the unique items within a Series/column (nunique() returns how many there are).
iris_data["class"].unique()

In [None]:
# use value_counts() to get the quantity of each unique item within a Series/column.
iris_data["class"].value_counts()

### Transforming and Cleaning Data

In [None]:
# Small DataFrame for the cleaning examples: one numeric column and one
# categorical column.
data = pd.DataFrame(
    {
        'age': [0, 26, 41, 0],
        'gender': ["Male", "Female", "Female", "Female"],
    }
)
data

In [None]:
# Transform categorical variables into binary (discrete) variables
# map enables convenient element-wise transformations

data['gender'] = data['gender'].map({'Male': 0, 'Female': 1})
data

In [None]:
data.mean(axis=0)

In [None]:
# replace values 
# nan ("not a number") values are not used in calculating the mean, etc.
 
data['age'] = data['age'].replace(0, np.nan)
data

# replace provides flexibility
# can also pass a list of multiple values to replace (e.g., replace([0, -1], np.nan))
# can provide a different replacement for each value (e.g., replace([0, -1], [np.nan, 1]))

In [None]:
data.mean()  # axis = 0 is the default

In [None]:
# print formatting
# the number before the colon is the index position (0 is the default) of the format() argument
        # that you want printed.
# after the colon come the formatting instructions:
        # how many digits after the decimal point should be printed for the float.
    
print("The mean age is: {0:.2f}".format(data["age"].mean(), 99))

### Null (NaN) values

In [None]:
# nan can be used as a sentinel to drop or impute/replace a value

data['age'].isnull()    # notnull()

In [None]:
mask = data['age'].notnull() 
mask

In [None]:
data.loc[mask, 'age']

### Impute missing values

In [None]:
data.loc[data['age'].isnull(),'age'] = data['age'].mean()
data

### Drop a feature

In [None]:
# A redundant feature (column) can be removed with drop().
# By default drop() returns a modified copy: "data" is left unchanged and the
# result is stored in "data_2". columns=[...] names the columns to drop
# (index=[...] would instead drop a row labelled "age").
data_2 = data.drop(columns=['age'])

# Alternatively, inplace=True modifies the original DataFrame "data" itself.
data.drop(columns=['age'], inplace=True)

### Boolean Selection 

In [None]:
iris_data.shape

In [None]:
iris_data[iris_data['sepal_w'] < 3]

In [None]:
# isin() is a boolean check to see if items within a column are included in a list
iris_data["class"].isin(['Iris-setosa'])

In [None]:
# include only the items that are included within a given list of items
mask = iris_data["class"].isin(['Iris-setosa', 'Iris-virginica'])
iris_data[mask]

## Boolean selection within pandas

In [None]:
# And
iris_data.loc[(iris_data['sepal_w'] < 3) & (iris_data['sepal_l'] > 5)].head()

In [None]:
# Following the selection of rows, list the columns to show.

iris_data.loc[(iris_data['sepal_w'] < 3) & (iris_data['sepal_l'] > 5), ['sepal_w','class']].head() # [:5] 

In [None]:
# Or
iris_data.loc[(iris_data['sepal_w'] < 2.5) | (iris_data['sepal_w'] > 3.5)]

In [None]:
# Following the selection of rows, list the series of columns to show (inclusive).

iris_data.loc[(iris_data['sepal_w'] < 2.5) | (iris_data['sepal_w'] > 3.5), 'sepal_l':'petal_w']


In [None]:
# return only the values, not the DataFrame
   # capital X usually indicates all of the features that the algorithm will be given to learn from

X = iris_data.loc[(iris_data['sepal_w'] < 2.5) | (iris_data['sepal_w'] > 3.5), 'sepal_l':'petal_w'].values
X

In [None]:
# lowercase y usually indicates the classifications that you would like to predict.
    # Most algorithms require strings to be converted to numbers (you can use .map() to accomplish this).


y = iris_data['class'].map({'Iris-setosa':0, 'Iris-versicolor':1, 'Iris-virginica':2})
y

## Archive cleaned and transformed DataFrame

In [None]:
# Save the DataFrame to a CSV file.
# index=False leaves out the row index, which would otherwise be written as an
# extra unnamed first column and show up as "Unnamed: 0" when the file is read back.
df.to_csv("new_filename.csv", index=False)