## Pandas Tutorial

Pandas is an open source, BSD-licensed library providing high-performance, easy-to-use data structures and data analysis tools for the Python programming language.

Agenda

- What is a DataFrame?
- What is a Series?
- Different operations in pandas

In [None]:
## First step is to import pandas

import pandas as pd
import numpy as np

In [None]:
## Playing with Dataframe

# Build a 5x4 frame holding the integers 0..19 with explicit row and column labels.
df = pd.DataFrame(
    np.arange(0, 20).reshape(5, 4),
    index=['Row1', 'Row2', 'Row3', 'Row4', 'Row5'],
    # The last label was previously misspelled as "Coumn4".
    columns=["Column1", "Column2", "Column3", "Column4"],
)

In [None]:
df.to_csv('Test1.csv')

In [None]:
df.head()

In [None]:
## Accessing the elements

df.loc['Row1']

In [None]:
## Check the type

type(df.loc['Row1'])

In [None]:
df.iloc[:,:]

In [None]:
## Take the elements from Column2 onwards
# Column2 sits at position 1, so slice the columns from 1 to the end
# (a bare `:` for the columns would return the whole frame again).
df.iloc[:, 1:]


In [None]:
# Convert the DataFrame to an array: select every row and all columns
# from position 1 onward, then take the underlying values.
df.iloc[:,1:].values

In [None]:
# Tally how many times each category occurs in Column1.
data = {
    'Column1': ['A', 'B', 'A', 'C', 'B', 'A', 'C', 'C'],
}
df = pd.DataFrame(data)

counts = df['Column1'].value_counts()
print(counts)
# In a notebook, the bare expression df['Column1'].value_counts() displays the same result.

In [None]:
df=pd.read_csv('mercedesbenz.csv')

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
#Get the unique category counts
df['X0'].value_counts()

In [None]:
df[df['y']>160]

In [None]:
# Keep only the numeric columns, then compute their pairwise correlations.
df_numeric = df.select_dtypes(include=[np.number])
df_numeric.corr()

In [None]:
df['X11'].value_counts()

In [None]:
import numpy as np

In [None]:
lst_data=[[1,2,3],[3,4,np.nan],[5,6,np.nan],[np.nan,np.nan,np.nan]]

In [None]:
df=pd.DataFrame(lst_data)

In [None]:
df.head()

In [None]:
## Handling Missing Values

## Drop rows (axis=0) that contain NaN values

df.dropna(axis=0)

In [None]:
# Drop columns (axis=1) that contain NaN values
df.dropna(axis=1)

In [None]:
# A 5x3 frame of random values, labelled with a letter index that skips
# 'b', 'd' and 'g'.
row_labels = ['a', 'c', 'e', 'f', 'h']
col_labels = ['one', 'two', 'three']
df = pd.DataFrame(np.random.randn(5, 3), index=row_labels, columns=col_labels)

In [None]:
df.head()

In [None]:
# Reindex onto a-h; the labels 'b', 'd' and 'g' do not exist in df,
# so their rows are filled with NaN.
df2=df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])

In [None]:
df2

In [None]:
df2.dropna(axis=0)

In [None]:
pd.isna(df2['one'])

In [None]:
df2['one'].notna()

In [None]:
# Replace every missing value with the string 'Missing'
df2.fillna('Missing')

In [None]:
df2['one'].values

In [None]:
### Reading different data sources with the help of pandas

## CSV

In [3]:
from io import StringIO, BytesIO
import pandas as pd

In [4]:
data = ('col1,col2,col3\n'
            'x,y,1\n'
            'a,b,2\n'
            'c,d,3')

In [5]:
type(data)

str

In [6]:
pd.read_csv(StringIO(data))

Unnamed: 0,col1,col2,col3
0,x,y,1
1,a,b,2
2,c,d,3


In [8]:
## Read from specific columns
# usecols is given a callable here: a column is kept when its upper-cased
# name is 'COL1' or 'COL3'.
df=pd.read_csv(StringIO(data), usecols=lambda x: x.upper() in ['COL1', 'COL3'])
df

Unnamed: 0,col1,col3
0,x,1
1,a,2
2,c,3


In [None]:
df.to_csv('Test.csv')

In [None]:
## Specifying columns data types

# Note: the last row has only three fields, so column 'd' has no value there.
data = ('a,b,c,d\n'
            '1,2,3,4\n'
            '5,6,7,8\n'
            '9,10,11')


In [None]:
print(data)

In [None]:
df=pd.read_csv(StringIO(data),dtype=object)

In [None]:
df

In [None]:
df['a'][1]

In [None]:
# Per-column dtypes: plain int for 'b', float for 'c', nullable Int64 for 'a'.
# The builtin float replaces the np.float alias, which NumPy 1.24 removed.
df = pd.read_csv(StringIO(data), dtype={'b': int, 'c': float, 'a': 'Int64'})

In [None]:
df

In [None]:
df['a'][1]

In [None]:
## check the datatype
df.dtypes

In [None]:
## Index columns and trailing delimiters


In [None]:
data = ('index,a,b,c\n'
           '4,apple,bat,5.7\n'
            '8,orange,cow,10')

In [None]:
pd.read_csv(StringIO(data),index_col=0)

In [None]:
 data = ('a,b,c\n'
           '4,apple,bat,\n'
            '8,orange,cow,')

In [None]:
pd.read_csv(StringIO(data))

In [None]:
pd.read_csv(StringIO(data),index_col=False)

In [None]:
## Combining usecols and index_col
# Same sample as before: every data row ends with a trailing comma.
data = ('a,b,c\n'
        '4,apple,bat,\n'
        '8,orange,cow,')

In [None]:
pd.read_csv(StringIO(data), usecols=['b', 'c'],index_col=False)

In [None]:
## Quoting and escape characters. Very useful in NLP
# The quoted field contains commas and backslash-escaped double quotes.

data = 'a,b\n"hello, \\"Bob\\", nice to see you",5'

In [None]:
pd.read_csv(StringIO(data),escapechar='\\')

In [None]:
## Read a CSV directly from a URL (fields are tab-separated)

df=pd.read_csv('https://download.bls.gov/pub/time.series/cu/cu.item',
                 sep='\t')

In [None]:
df.head()

In [None]:
## Read JSON into a DataFrame

In [None]:
# read_json expects a path, URL or file-like object; passing the JSON text
# itself is deprecated in recent pandas, so wrap the literal in StringIO
# (imported from io earlier in this notebook).
Data = '{"employee_name": "James", "email": "james@gmail.com", "job_profile": [{"title1":"Team Lead", "title2":"Sr. Developer"}]}'
pd.read_json(StringIO(Data))

In [None]:
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data', header=None)

In [None]:
df.head()

In [None]:
# Write the DataFrame to a CSV file

In [None]:
df.to_csv('wine.csv')

In [None]:
# Serialize the DataFrame to JSON in different layouts (orient values)

df.to_json(orient="index")

In [None]:

df.to_json(orient="records")

## Reading HTML content 

In [None]:
# NOTE(review): this FDIC failed-bank list URL may have moved — confirm it still resolves.
url = 'https://www.fdic.gov/bank/individual/failed/banklist.html'

# The result is indexed as dfs[0] in the next cell, i.e. one entry per table found.
dfs = pd.read_html(url)

In [None]:
dfs[0]

In [None]:
url_mcc = 'https://en.wikipedia.org/wiki/Mobile_country_code'
dfs = pd.read_html(url_mcc, match='Country', header=0)

In [None]:
dfs[0]

## Reading Excel Files

In [None]:
df_excel=pd.read_excel('Excel_Sample.xlsx')

In [None]:
df_excel.head()

## Pickling
All pandas objects are equipped with to_pickle methods which use Python’s pickle module to save data structures to disk using the pickle format.

In [None]:
df_excel.to_pickle('df_excel123')

In [None]:
# Load the pickle back. The file name must match the one written by
# to_pickle in the previous cell ('df_excel123'); 'df_excel' was never created.
df = pd.read_pickle('df_excel123')

In [None]:
df.head()