# Working with Pandas - Part 2

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Creating a pandas Series object: the labels 'A'..'E' form the index, the numbers are the values
s = pd.Series(data=[15, 8, 6, 2, 10], index=['A', 'B', 'C', 'D', 'E'])
s

In [None]:
# Creating a dataframe: every key of the outer dict becomes a column.
# 'age' reuses the Series s; 'test' is given as a plain dict keyed by the same labels.
df = pd.DataFrame(
    data={
        'age': s,
        'test': {'A': 2.6, 'B': 69.27, 'C': 14.2, 'D': 8.0, 'E': 5.93},
    }
)
df

In [None]:
# Raw state records. Area and Pop are stored as text with thousands separators,
# and one Area entry is the placeholder '*'; later cells clean both up.
state_data = {
    'State': ['Alabama', 'Alaska', 'Arizona', 'Arkansas'],
    'PostCode': ['AL', 'AK', 'AZ', 'AR'],
    'Area': ['52,423', '656,424', '*', '53,182'],
    'Pop': ['4,040,587', '550,043', '3,665,228', '2,350,725'],
}

# When we define the dataframe, we can use the columns argument to set the order of the columns.
# (Only the last expression of a cell is displayed, so the dataframe is what gets shown here.)
stdf = pd.DataFrame(state_data, columns=['State', 'PostCode', 'Area', 'Pop'])
stdf


In [None]:
stdf['Area']   # select the Area column by name
#stdf.Area     # attribute-style access also gets the Area column



In [None]:
stdf['Area'][0]   # chained lookup: take column 'Area', then the item at index 0 (the first row here)


In [None]:
stdf[0:2]   # a [start:stop] slice on the dataframe selects rows: here the first two (stop is excluded)

In [None]:
stdf[0:2]["Pop"]   # Getting complicated: first slice the rows, then pick a column. Pandas offers better ways to navigate dataframes - we will see them later

In [None]:
# Next we redefine the index values to be the State column.
# set_index returns a new dataframe, so stdf itself keeps its default integer index.

stdf2 = stdf.set_index('State')
stdf2


## Slicing - using loc and iloc

The `loc[]` property is used to slice a pandas DataFrame or Series and access row(s) and column(s) **by label**. The `iloc[]` property does the same **by integer position**, counting from zero.

In [None]:
stdf2.loc[:,"Area":"Pop"]   # all rows (:), columns from "Area" through "Pop" inclusive

In [None]:
stdf.loc[:,"Area":"Pop"]   # The non-indexed version result is different: same columns, but the rows carry the default integer index instead of state names

In [None]:
stdf2.loc["Alaska":"Arkansas","Area":"Pop"]  # Label slices include both endpoints, so the Arkansas row and the Pop column are part of the result

In [None]:
stdf2   # show the State-indexed dataframe again, for reference while trying iloc

In [None]:
stdf2.iloc[1]  # Row at position 1 (counting from zero) = row 2 counting from 1 = the row with the data for Alaska

In [None]:
stdf2.iloc[1:4,1:3]  # by position: rows 1-3 and columns 1-2 (stop excluded); same cells as the earlier loc["Alaska":"Arkansas","Area":"Pop"] statement

## Cleaning and understanding a dataset

In [None]:
# Replacing '*' with '0'.
# '*' appears to stand for a missing Area value; '0' is still text here, but unlike '*' it can be converted to a number later.

stdf = stdf.replace('*','0')
stdf


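We want to remove the commas (",") from the values so that they can be treated as numbers. First, let's look at how the columns are currently stored.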

In [None]:
# Get information on the structure and data types in the data frame - very useful.
# Check the Dtype column of the output to see how Area and Pop are stored.
stdf.info()

In [None]:
# At this point 'Pop' still holds text such as '4,040,587', so pandas cannot average it.
# The failure is the point of this cell; it is caught and printed so that
# "Restart & Run All" can continue past it.
try:
    mean_pop = stdf['Pop'].mean()  # Let's get the average population value
except (TypeError, ValueError) as err:
    print(type(err).__name__, err, sep=': ')

There are issues with the data type of some of the columns. Let's try to fix them.

In [None]:
# First attempt at converting Area to integers. Values such as '52,423' contain commas,
# so the conversion is rejected; the error is caught and printed so the notebook keeps running.
try:
    stdf['Area'] = stdf['Area'].astype(int)
except ValueError as err:
    print(type(err).__name__, err, sep=': ')
stdf

The commas are getting in the way: a string such as '52,423' cannot be converted to an integer directly.

We will use the map() function to start cleaning the dataset. See: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.map.html

In [None]:
def item_replace(xstr):
    """Return a copy of the string ``xstr`` with every ',' removed."""
    cleaned = xstr.replace(',', '')   # swap each ',' for the empty string
    return cleaned

# map() calls item_replace once for every value in the column and collects the results
stdf['Pop'] = stdf['Pop'].map(item_replace)
stdf


In [None]:
# Same clean-up for the Area column
stdf['Area'] = stdf['Area'].map(item_replace)
stdf


In [None]:
# Try again to convert to integer, this time spelling out the target type per column
stdf[['Area', 'Pop']] = stdf[['Area', 'Pop']].astype({'Area': int, 'Pop': int})
stdf

In [None]:
stdf.dtypes   # Verify: Area and Pop should now be listed with an integer dtype

In [None]:
mean_pop = stdf['Pop'].mean()  # Try again to get the average population value, now that Pop has been converted to integers
mean_pop

In [None]:
# The 0 in Area is the stand-in written when '*' was replaced, not a real measurement.
# Leave it out of the average; otherwise the value used to fill the gap is pulled towards zero.
known_area = stdf.loc[stdf['Area'] != 0, 'Area']
mean_area = known_area.mean()
mean_area

We will use the `mask()` method to replace the 0 area value (the stand-in for the missing entry) with the mean area value. See: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.mask.html

In [None]:
# Assigns the mean to any zero values: mask() swaps in `other` wherever the condition is True
stdf['Area'] = stdf['Area'].mask(stdf['Area'] == 0, other=mean_area)
stdf

## Handling NaN values

In [None]:
# Small example frame: column B contains one missing value (np.nan) in the 'Gamma' row
d1 = {
    'A': ['Alpha', 'Beta', 'Gamma', 'Delta'],
    'B': [11., 3., np.nan, 1.],
}
df1 = pd.DataFrame(data=d1)
df1


In [None]:
mean_B = df1['B'].mean(skipna=True)   # skipna=True: NaN entries are left out of the average
mean_B


In [None]:
mean_B = df1['B'].mean()  # skipna=True is the default - check the documentation when in doubt, or pass it explicitly
mean_B


In [None]:
# Fill the gap: wherever B is null, mask() substitutes mean_B
df1['B'] = df1['B'].mask(cond=df1['B'].isnull(), other=mean_B)
df1


## Deleting rows or columns

In [None]:
df1.drop(2)    # returns a copy without the row labelled 2; df1 itself is left unchanged

In [None]:
df1   # still has all four rows

In [None]:
df1.drop(2, inplace=True)   # inplace=True modifies df1 itself; running this cell a second time raises KeyError because label 2 is already gone

In [None]:
df1   # the row labelled 2 is now gone

Dropping a column

In [None]:
df1.drop('A', axis=1)   # axis=1 targets columns; this returns a copy without 'A' - the result is not assigned, so df1 is unchanged

In [None]:
df1   # column 'A' is still present