<a href="https://colab.research.google.com/github/techakilan/python-datascience-training/blob/master/PythonPandas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

##### Get Version

In [2]:

print(pd.__version__) 

1.1.5


### Series

In [3]:
# A Series is a one-dimensional labelled array; build one from a plain list.
mylist = [1, 7, 2]
myseries = pd.Series(data=mylist)
print(myseries)

0    1
1    7
2    2
dtype: int64


In [4]:
type(myseries)

pandas.core.series.Series

##### Default index

In [5]:
print(myseries)

0    1
1    7
2    2
dtype: int64


In [6]:
# Read each element of the series through its default integer label (0, 1, 2).
for label in range(3):
    print(myseries[label])

1
7
2


##### Labelled Index

In [7]:
# Attach custom labels to the values instead of the default 0, 1, 2 index.
mylist = [1, 7, 2]
labels = ["x", "y", "z"]
myseries = pd.Series(data=mylist, index=labels)
print(myseries)

x    1
y    7
z    2
dtype: int64


In [8]:
# Read each element of the series through its index label.
for label in ("x", "y", "z"):
    print(myseries[label])

1
7
2


In [9]:
# access items in a series by position
# The index here holds string labels, so use .iloc for positional access:
# plain myseries[0] relies on an integer-key fallback that recent pandas
# releases deprecate.
print(myseries.iloc[0])
print(myseries.iloc[1])
print(myseries.iloc[2])

1
7
2


##### Create series using Dict

In [10]:
# Calories recorded per day; the dict keys become the index labels of the series.

mydict = dict(day1=420, day2=380, day3=390)
myseries = pd.Series(data=mydict)
print(myseries)

day1    420
day2    380
day3    390
dtype: int64


In [11]:
# Read each day's value from the series through its label.
for day in ("day1", "day2", "day3"):
    print(myseries[day])

420
380
390


In [12]:
# access items in a series by position
# The index holds the labels "day1".."day3", so use .iloc for positional
# access: plain myseries[0] relies on an integer-key fallback that recent
# pandas releases deprecate.
print(myseries.iloc[0])
print(myseries.iloc[1])
print(myseries.iloc[2])

420
380
390


In [13]:
# Build a series holding only the day1 and day2 entries of mydict by
# naming the wanted keys through the index argument.

wanted_days = ["day1", "day2"]
newseries = pd.Series(mydict, index=wanted_days)
print(newseries)

day1    420
day2    380
dtype: int64


### DataFrame

##### Create dataframe

In [14]:
# Values for the "calories" column, kept in a plain Python list.

calories = [420, 380, 390]
print(calories, type(calories), sep="\n")

[420, 380, 390]
<class 'list'>


In [15]:
# Values for the "duration" column, kept in a plain Python list.

duration = [50, 40, 45]
print(duration, type(duration), sep="\n")


[50, 40, 45]
<class 'list'>


In [16]:
# Assemble the DataFrame input: a dict mapping each column name to its list of values.

mydata = dict(calories=calories, duration=duration)
print(mydata, type(mydata), sep="\n")

{'calories': [420, 380, 390], 'duration': [50, 40, 45]}
<class 'dict'>


In [17]:
# Build the DataFrame from the column dict, then show it followed by its type.
mydataframe = pd.DataFrame(data=mydata)
print(mydataframe, type(mydataframe), sep="\n")

   calories  duration
0       420        50
1       380        40
2       390        45
<class 'pandas.core.frame.DataFrame'>


##### Get a row/rows from a dataframe

In [18]:
# get first row from dataframe
mydataframe.loc[0]

calories    420
duration     50
Name: 0, dtype: int64

In [19]:
# data type of a row in a dataframe
type(mydataframe.loc[0])

pandas.core.series.Series

In [20]:
# get last row in a dataframe
# .iloc[-1] selects by position, so it returns the last row whatever the
# index labels are; .loc[len(df) - 1] is a label lookup that only finds the
# last row when the index is the default 0..n-1.
mydataframe.iloc[-1]


calories    390
duration     45
Name: 2, dtype: int64

In [21]:
# get multiple rows from a dataframe - Filtering

mydataframe.loc[[0,1]]

Unnamed: 0,calories,duration
0,420,50
1,380,40


###### Index of a dataframe

In [22]:
# Default Index
mydataframe

Unnamed: 0,calories,duration
0,420,50
1,380,40
2,390,45


In [23]:
# Named index: label the rows "day1".."day3" instead of the default 0, 1, 2.
day_labels = ["day1", "day2", "day3"]
namedindexdf = pd.DataFrame(mydata, index=day_labels)
print(namedindexdf)

      calories  duration
day1       420        50
day2       380        40
day3       390        45


In [24]:
# Locate using named index
namedindexdf.loc["day1"]

calories    420
duration     50
Name: day1, dtype: int64

In [25]:
# filter
namedindexdf.loc[["day1","day2"]]

Unnamed: 0,calories,duration
day1,420,50
day2,380,40


### CSV

##### Create a csv file

In [26]:
import csv

# Column names followed by the sample rows written to countries.csv.
header = ['name', 'area', 'country_code2', 'country_code3']
data = [
    ['Albania', 28748, 'AL', 'ALB'],
    ['Algeria', 2381741, 'DZ', 'DZA'],
    ['American Samoa', 199, 'AS', 'ASM'],
    ['Andorra', 468, 'AD', 'AND'],
    ['Angola', 1246700, 'AO', 'AGO'],
    ['Anguilla', 102, 'AI', 'AIA']
]

# newline='' hands line-ending control to the csv writer.
with open('countries.csv', 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)

    # write the header
    writer.writerow(header)

    # write multiple rows
    writer.writerows(data)
# No explicit f.close() is needed: leaving the with-block closes the file.

### Load data from a file into a dataframe

In [27]:
# Load a csv file
df = pd.read_csv('countries.csv')

In [28]:
# print the dataframe
print(df)

             name     area country_code2 country_code3
0         Albania    28748            AL           ALB
1         Algeria  2381741            DZ           DZA
2  American Samoa      199            AS           ASM
3         Andorra      468            AD           AND
4          Angola  1246700            AO           AGO
5        Anguilla      102            AI           AIA


To print the entire DataFrame without truncation, use `print(df.to_string())`.

### JSON

In [29]:
# Reshape the row-oriented `data` list into a column-oriented mapping:
# {column name: {row number: value}}, using `header` for the column names.
headerlength = len(header)
datacount = len(data)
datajson = {
    header[col]: {row: data[row][col] for row in range(datacount)}
    for col in range(headerlength)
}

print(datajson)

{'name': {0: 'Albania', 1: 'Algeria', 2: 'American Samoa', 3: 'Andorra', 4: 'Angola', 5: 'Anguilla'}, 'area': {0: 28748, 1: 2381741, 2: 199, 3: 468, 4: 1246700, 5: 102}, 'country_code2': {0: 'AL', 1: 'DZ', 2: 'AS', 3: 'AD', 4: 'AO', 5: 'AI'}, 'country_code3': {0: 'ALB', 1: 'DZA', 2: 'ASM', 3: 'AND', 4: 'AGO', 5: 'AIA'}}


In [32]:
import json
# write json data to file
with open('data.json', 'w') as outfile:
    json.dump(datajson, outfile)

In [33]:
# Load country data into dataframe
jsondf =  pd.DataFrame(datajson)
jsondf

Unnamed: 0,name,area,country_code2,country_code3
0,Albania,28748,AL,ALB
1,Algeria,2381741,DZ,DZA
2,American Samoa,199,AS,ASM
3,Andorra,468,AD,AND
4,Angola,1246700,AO,AGO
5,Anguilla,102,AI,AIA


In [34]:
# load json data from file
df = pd.read_json('data.json')
print(df) 

             name     area country_code2 country_code3
0         Albania    28748            AL           ALB
1         Algeria  2381741            DZ           DZA
2  American Samoa      199            AS           ASM
3         Andorra      468            AD           AND
4          Angola  1246700            AO           AGO
5        Anguilla      102            AI           AIA


### CSV from Google drive

##### Mount google drive

In [35]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


##### Import csv file from google drive and create dataframe

In [36]:
iris_df = pd.read_csv('/content/drive/MyDrive/colabs_data/Iris.csv')
iris_df

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica


#### head()

In [37]:
iris_df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [38]:
iris_df.head(10)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa
5,6,5.4,3.9,1.7,0.4,Iris-setosa
6,7,4.6,3.4,1.4,0.3,Iris-setosa
7,8,5.0,3.4,1.5,0.2,Iris-setosa
8,9,4.4,2.9,1.4,0.2,Iris-setosa
9,10,4.9,3.1,1.5,0.1,Iris-setosa


#### tail()

In [39]:
iris_df.tail()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica
149,150,5.9,3.0,5.1,1.8,Iris-virginica


In [40]:
iris_df.tail(10)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
140,141,6.7,3.1,5.6,2.4,Iris-virginica
141,142,6.9,3.1,5.1,2.3,Iris-virginica
142,143,5.8,2.7,5.1,1.9,Iris-virginica
143,144,6.8,3.2,5.9,2.3,Iris-virginica
144,145,6.7,3.3,5.7,2.5,Iris-virginica
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica
149,150,5.9,3.0,5.1,1.8,Iris-virginica


#### Get info about the dataframe

In [None]:
iris_df.info()

In [None]:
iris_df.describe()

### Handling missing data

In [43]:
titanic_df = pd.read_csv('/content/drive/MyDrive/colabs_data/titanic/train.csv')

In [None]:
titanic_df.info()

#### fillna()