# Advantages of Pandas

* A fast and efficient DataFrame object for data manipulation with integrated indexing
* Tools for reading and writing data between in-memory data structures and different formats: CSV and text files, Microsoft Excel, SQL databases, and the fast HDF5 format 

In [2]:
import pandas as pd
import numpy as np
# Lay the integers 0..4999 out row by row in a 100-row, 50-column grid
data = np.reshape(np.arange(5000), (100, 50))
data

array([[   0,    1,    2, ...,   47,   48,   49],
       [  50,   51,   52, ...,   97,   98,   99],
       [ 100,  101,  102, ...,  147,  148,  149],
       ...,
       [4850, 4851, 4852, ..., 4897, 4898, 4899],
       [4900, 4901, 4902, ..., 4947, 4948, 4949],
       [4950, 4951, 4952, ..., 4997, 4998, 4999]])

# Data Frames

In [3]:
data

array([[   0,    1,    2, ...,   47,   48,   49],
       [  50,   51,   52, ...,   97,   98,   99],
       [ 100,  101,  102, ...,  147,  148,  149],
       ...,
       [4850, 4851, 4852, ..., 4897, 4898, 4899],
       [4900, 4901, 4902, ..., 4947, 4948, 4949],
       [4950, 4951, 4952, ..., 4997, 4998, 4999]])

In [4]:
# One label per column of the 50-column array: columns_0 ... columns_49
columnName = [f'columns_{idx}' for idx in range(50)]
columnName

['columns_0',
 'columns_1',
 'columns_2',
 'columns_3',
 'columns_4',
 'columns_5',
 'columns_6',
 'columns_7',
 'columns_8',
 'columns_9',
 'columns_10',
 'columns_11',
 'columns_12',
 'columns_13',
 'columns_14',
 'columns_15',
 'columns_16',
 'columns_17',
 'columns_18',
 'columns_19',
 'columns_20',
 'columns_21',
 'columns_22',
 'columns_23',
 'columns_24',
 'columns_25',
 'columns_26',
 'columns_27',
 'columns_28',
 'columns_29',
 'columns_30',
 'columns_31',
 'columns_32',
 'columns_33',
 'columns_34',
 'columns_35',
 'columns_36',
 'columns_37',
 'columns_38',
 'columns_39',
 'columns_40',
 'columns_41',
 'columns_42',
 'columns_43',
 'columns_44',
 'columns_45',
 'columns_46',
 'columns_47',
 'columns_48',
 'columns_49']

In [5]:
# Convert the array to a DataFrame to get a tabular view of the data
dataSamp = pd.DataFrame(data=data)

In [6]:
dataSamp.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
1,50,51,52,53,54,55,56,57,58,59,...,90,91,92,93,94,95,96,97,98,99
2,100,101,102,103,104,105,106,107,108,109,...,140,141,142,143,144,145,146,147,148,149
3,150,151,152,153,154,155,156,157,158,159,...,190,191,192,193,194,195,196,197,198,199
4,200,201,202,203,204,205,206,207,208,209,...,240,241,242,243,244,245,246,247,248,249


In [7]:
type(dataSamp)

pandas.core.frame.DataFrame

In [8]:
# Indexing the frame with a column label (here the integer 0) returns that column as a Series
dataSamp[0]

0        0
1       50
2      100
3      150
4      200
      ... 
95    4750
96    4800
97    4850
98    4900
99    4950
Name: 0, Length: 100, dtype: int64

In [9]:
type(dataSamp[0])

pandas.core.series.Series

In [10]:
# Rebuild the frame, this time labelling its columns with the names in columnName
dataSamp = pd.DataFrame(data=data, columns=columnName)

In [11]:
dataSamp

Unnamed: 0,columns_0,columns_1,columns_2,columns_3,columns_4,columns_5,columns_6,columns_7,columns_8,columns_9,...,columns_40,columns_41,columns_42,columns_43,columns_44,columns_45,columns_46,columns_47,columns_48,columns_49
0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
1,50,51,52,53,54,55,56,57,58,59,...,90,91,92,93,94,95,96,97,98,99
2,100,101,102,103,104,105,106,107,108,109,...,140,141,142,143,144,145,146,147,148,149
3,150,151,152,153,154,155,156,157,158,159,...,190,191,192,193,194,195,196,197,198,199
4,200,201,202,203,204,205,206,207,208,209,...,240,241,242,243,244,245,246,247,248,249
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,4750,4751,4752,4753,4754,4755,4756,4757,4758,4759,...,4790,4791,4792,4793,4794,4795,4796,4797,4798,4799
96,4800,4801,4802,4803,4804,4805,4806,4807,4808,4809,...,4840,4841,4842,4843,4844,4845,4846,4847,4848,4849
97,4850,4851,4852,4853,4854,4855,4856,4857,4858,4859,...,4890,4891,4892,4893,4894,4895,4896,4897,4898,4899
98,4900,4901,4902,4903,4904,4905,4906,4907,4908,4909,...,4940,4941,4942,4943,4944,4945,4946,4947,4948,4949


In [12]:
dataSamp.head()

Unnamed: 0,columns_0,columns_1,columns_2,columns_3,columns_4,columns_5,columns_6,columns_7,columns_8,columns_9,...,columns_40,columns_41,columns_42,columns_43,columns_44,columns_45,columns_46,columns_47,columns_48,columns_49
0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
1,50,51,52,53,54,55,56,57,58,59,...,90,91,92,93,94,95,96,97,98,99
2,100,101,102,103,104,105,106,107,108,109,...,140,141,142,143,144,145,146,147,148,149
3,150,151,152,153,154,155,156,157,158,159,...,190,191,192,193,194,195,196,197,198,199
4,200,201,202,203,204,205,206,207,208,209,...,240,241,242,243,244,245,246,247,248,249


In [None]:
# Mapping of two name keys to their string values
dict1 = dict(name1='XYZ1', name2='XYZ2')

In [None]:
# Mapping of two score keys to their integer values
dict2 = dict(score1=15, score2=25)

In [17]:
# Each dict becomes one row; a key that a dict lacks shows up as a missing value (NaN) in its row
pd.DataFrame(data=(dict1,dict2))

Unnamed: 0,name1,name2,score1,score2
0,XYZ1,XYZ2,,
1,,,15.0,25.0


In [30]:
# Two equal-length lists: a countdown from 10 to 1 and a count-up from 10 to 19
dict1 = {
    'name1': [*range(10, 0, -1)],
    'name2': [*range(10, 20)],
}
dict1

{'name1': [10, 9, 8, 7, 6, 5, 4, 3, 2, 1],
 'name2': [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]}

In [31]:
# Each key becomes a column and its list supplies that column's values
pd.DataFrame(dict1)

Unnamed: 0,name1,name2
0,10,10
1,9,11
2,8,12
3,7,13
4,6,14
5,5,15
6,4,16
7,3,17
8,2,18
9,1,19


## Read Data

In [34]:
# Read the CSV file into a DataFrame.
# NOTE: this rebinds `data`, which earlier in the notebook held the NumPy array.
data = pd.read_csv('dataset_32_pendigits.csv')

In [35]:
# head() shows the first 5 rows by default; pass a row count in the parentheses (e.g. data.head(10)) to change that.
data.head() 

Unnamed: 0,input1,input2,input3,input4,input5,input6,input7,input8,input9,input10,input11,input12,input13,input14,input15,input16,class
0,47,100,27,81,57,37,26,0,0,23,56.0,53,100,90,40,98,8
1,0,89,27,100,42,75,29,45,15,15,37.0,0,69,2,100,6,2
2,0,57,31,68,72,90,100,100,76,75,50.0,51,28,25,16,0,1
3,0,100,7,92,5,68,19,45,86,34,100.0,45,74,23,67,0,4
4,0,67,49,83,100,100,81,80,60,60,40.0,40,33,20,47,0,1


In [36]:
data.tail()

Unnamed: 0,input1,input2,input3,input4,input5,input6,input7,input8,input9,input10,input11,input12,input13,input14,input15,input16,class
10987,36,100,24,70,0,38,49,33,95,47,87.0,55,96,21,100,0,4
10988,16,75,41,100,52,64,32,27,0,0,21.0,9,62,2,100,14,2
10989,56,100,27,79,0,39,12,0,66,15,100.0,51,93,93,38,93,0
10990,19,100,0,61,3,23,48,0,97,27,100.0,66,62,97,10,81,0
10991,38,100,37,81,12,55,0,28,52,27,100.0,42,86,26,65,0,4


In [38]:
data.columns

Index(['input1', 'input2', 'input3', 'input4', 'input5', 'input6', 'input7',
       'input8', 'input9', 'input10', 'input11', 'input12', 'input13',
       'input14', 'input15', 'input16', 'class'],
      dtype='object')

In [39]:
data.dtypes

input1       int64
input2       int64
input3       int64
input4       int64
input5       int64
input6       int64
input7       int64
input8       int64
input9       int64
input10      int64
input11    float64
input12      int64
input13      int64
input14      int64
input15      int64
input16      int64
class        int64
dtype: object

In [41]:
data.describe()

Unnamed: 0,input1,input2,input3,input4,input5,input6,input7,input8,input9,input10,input11,input12,input13,input14,input15,input16,class
count,10992.0,10992.0,10992.0,10992.0,10992.0,10992.0,10992.0,10992.0,10992.0,10992.0,10744.0,10992.0,10992.0,10992.0,10992.0,10992.0,10992.0
mean,38.81432,85.120269,40.605622,83.774199,49.770378,65.573144,51.220251,44.498999,56.868541,33.695961,59.769918,34.82651,55.022289,34.937045,47.287482,28.845342,4.431587
std,34.257783,16.218571,26.342984,19.163646,34.100515,26.996688,30.576881,29.906104,34.13553,27.251548,37.385931,27.119982,22.335539,33.155463,41.7604,35.778094,2.876947
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,6.0,76.0,20.0,72.0,18.0,49.0,28.0,23.0,29.0,7.0,22.0,11.0,42.0,5.0,0.0,0.0,2.0
50%,32.0,89.0,40.0,91.0,53.0,71.0,53.5,43.0,60.0,33.0,72.0,30.0,53.0,27.0,40.0,9.0,4.0
75%,65.0,100.0,58.0,100.0,78.0,86.0,74.0,64.0,89.0,54.0,98.0,55.0,68.0,47.0,100.0,51.0,7.0
max,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,9.0


In [42]:
data.index

RangeIndex(start=0, stop=10992, step=1)

In [49]:
# Per-column count of non-missing entries; input11 has 10744 against 10992 rows, i.e. 248 missing values
data.count()

input1     10992
input2     10992
input3     10992
input4     10992
input5     10992
input6     10992
input7     10992
input8     10992
input9     10992
input10    10992
input11    10744
input12    10992
input13    10992
input14    10992
input15    10992
input16    10992
class      10992
dtype: int64

In [51]:
data.mean()

input1     38.814320
input2     85.120269
input3     40.605622
input4     83.774199
input5     49.770378
input6     65.573144
input7     51.220251
input8     44.498999
input9     56.868541
input10    33.695961
input11    59.769918
input12    34.826510
input13    55.022289
input14    34.937045
input15    47.287482
input16    28.845342
class       4.431587
dtype: float64

In [53]:
data.min()

input1     0.0
input2     0.0
input3     0.0
input4     0.0
input5     0.0
input6     0.0
input7     0.0
input8     0.0
input9     0.0
input10    0.0
input11    0.0
input12    0.0
input13    0.0
input14    0.0
input15    0.0
input16    0.0
class      0.0
dtype: float64

In [54]:
data.max()

input1     100.0
input2     100.0
input3     100.0
input4     100.0
input5     100.0
input6     100.0
input7     100.0
input8     100.0
input9     100.0
input10    100.0
input11    100.0
input12    100.0
input13    100.0
input14    100.0
input15    100.0
input16    100.0
class        9.0
dtype: float64

In [55]:
data.median()

input1     32.0
input2     89.0
input3     40.0
input4     91.0
input5     53.0
input6     71.0
input7     53.5
input8     43.0
input9     60.0
input10    33.0
input11    72.0
input12    30.0
input13    53.0
input14    27.0
input15    40.0
input16     9.0
class       4.0
dtype: float64