In [1]:
!pip install pandas



In [None]:
import pandas as pd
import numpy as np

<h3>Introduction to Data Structures</h3>

<table class="table table-bordered">
<tbody><tr>
<th style="text-align:center;">Data Structure</th>
<th style="text-align:center;">Dimensions</th>
<th style="text-align:center;">Description</th>
</tr>
<tr>
<td style="text-align:center;">Series</td>
<td style="text-align:center;">1</td>
<td style="text-align:center;">1D labeled homogeneous array, sizeimmutable.</td>
</tr>
<tr>
<td style="text-align:center;">Data Frames</td>
<td style="text-align:center;">2</td>
<td style="text-align:center;">General 2D labeled, size-mutable tabular structure with potentially heterogeneously typed
columns.</td>
</tr>
<tr>
<td style="text-align:center;">Panel</td>
<td style="text-align:center;">3</td>
<td style="text-align:center;">General 3D labeled, size-mutable array.</td>
</tr>
</tbody></table>

<h3>Python Pandas - Series</h3>

pandas.Series( data, index, dtype, copy)

<table class="table table-bordered">
<tbody><tr>

<th style="text-align:center;">Parameter &amp; Description</th>
</tr>
<tr>

<td style="text-align:center;">
<p><b>data</b></p>
<p>data takes various forms like ndarray, list, constants</p>
</td>
</tr>
<tr>

<td style="text-align:center;">
<p><b>index</b></p>
<p>Index values must be unique and hashable, same length as data. Default <b>np.arrange(n)</b> if no index is passed.</p>
</td>
</tr>
<tr>

<td style="text-align:center;">
<p><b>dtype</b></p>
<p>dtype is for data type. If None, data type will be inferred</p>
</td>
</tr>
<tr>

<td style="text-align:center;">
<p><b>copy</b></p>
<p>Copy data. Default False</p>
</td>
</tr>
</tbody></table>

In [4]:
data = np.array(['a','b','c','d'])
s = pd.Series(data)
print(s)
# the left colum is the index and right column is the corresponding value

0    a
1    b
2    c
3    d
dtype: object


In [6]:
s = pd.Series(data,index=[100,101,102,103])
# index can be overwritten
print(s)

100    a
101    b
102    c
103    d
dtype: object


In [10]:
data = {'a' : 0., 'b' : 1., 'c' : 2.}
s = pd.Series(data,index=['b','c','d','a'])
print(s)
# the missing element is filled with NaN at d

b    1.0
c    2.0
d    NaN
a    0.0
dtype: float64


In [12]:
s = pd.Series(5, index=[0, 1, 2, 3])
print(s)

0    5
1    5
2    5
3    5
dtype: int64


In [34]:
s = pd.Series([1,2,3,4,5,6])
print(s[5])
print()
print(s[:2])
print()
print(s[-2:])
print()
print(s[[0,4]])

6

0    1
1    2
dtype: int64

4    5
5    6
dtype: int64

0    1
4    5
dtype: int64


<h3>Python Pandas - DataFrame</h3>

<img src ="https://www.tutorialspoint.com/python_pandas/images/structure_table.jpg">



pandas.DataFrame( data, index, columns, dtype, copy)

<table class="table table-bordered">
<tbody><tr>

<th style="text-align:center;">Parameter &amp; Description</th>
</tr>
<tr>

<td style="text-align:center;">
<p><b>data</b></p>
<p>data takes various forms like ndarray, series, map, lists, dict, constants and also another DataFrame.</p>
</td>
</tr>

<tr>

<td style="text-align:center;">
<p><b>columns</b></p>
<p>For column labels, the optional default syntax is - np.arrange(n). This is only true if no index is passed.</p>
</td>
</tr>


</tbody></table>

In [10]:
data = [1,2,3]
df = pd.DataFrame(data)
print (df)
print()

data = [['Alex',10],['Bob',12],['Clarke',13]]
df = pd.DataFrame(data,columns=['Name','Age'])
print(df)
print()

data = {'Name':['Tom', 'Jack', 'Steve', ],'Age':[28,34,29]}
df = pd.DataFrame(data, index=['rank1','rank2','rank3',])
print (df)

   0
0  1
1  2
2  3

     Name  Age
0    Alex   10
1     Bob   12
2  Clarke   13

        Name  Age
rank1    Tom   28
rank2   Jack   34
rank3  Steve   29


<h3>Python Pandas - Basic Functionality</h3>


<b>Series and DataFrame</b>
<table class="table table-bordered">
<tbody><tr>

<th style="text-align:center;">Attribute or
Method &amp; Description</th>
</tr>
<tr>

<td>
<p><b>axes</b></p>
<p>Returns a list of the row axis labels</p></td>
</tr>
<tr>

<td>
<p><b>dtype</b></p>
<p>Returns the dtype of the object.</p></td>
</tr>
<tr>

<td>
<p><b>empty</b></p>
<p>Returns True if series is empty.</p></td>
</tr>
<tr>

<td>
<p><b>ndim</b></p>
<p>Returns the number of dimensions of the underlying data, by
definition 1.</p></td>
</tr>
<tr>

<td>
<p><b>size</b></p>
<p>Returns the number of elements in the underlying data.</p></td>
</tr>
<tr>

<td>
<p><b>values</b></p>
<p>Returns the Series as ndarray.</p></td>
</tr>
<tr>

<td>
<p><b>head()</b></p>
<p>Returns the first n rows.</p></td>
</tr>
<tr>

<td>
<p><b>tail()</b></p>
<p>Returns the last n rows.</p></td>
</tr>
</tbody></table>
<b>DataFrame</b>
<table class="table table-bordered">
<tbody><tr>



<td>
<p><b>T</b></p>
<p>Transposes rows and columns.</p></td>
</tr>








</tbody></table>

In [12]:
s = pd.Series(np.random.randn(10))
print (s.axes)
print()
print (s.empty)
print()
print (s.ndim)
print()
print (s.size)
print()
print (s.values)
print()
print (s.head(1))
print()
print (s.tail(1))
print()
d = pd.DataFrame([[1,1,1],[2,2,2],[3,3,3]])
print(d.T)

[RangeIndex(start=0, stop=10, step=1)]

False

1

10

[ 0.71908025 -0.48830195 -1.68209854  0.65789081  0.21757745 -0.155688
 -0.32857113 -1.37585845 -0.20849954  0.88129518]

0    0.71908
dtype: float64

9    0.881295
dtype: float64

   0  1  2
0  1  2  3
1  1  2  3
2  1  2  3


<h3>Python Pandas - Descriptive Statistics</h3>
<table class="table table-bordered">
<tbody><tr>

<th style="text-align:center;">Function</th>
<th style="text-align:center;">Description</th>
</tr>
<tr>

<td style="text-align:center;">count()</td>
<td>Number of non-null observations</td>
</tr>
<tr>

<td style="text-align:center;">sum()</td>
<td>Sum of values</td>
</tr>
<tr>

<td style="text-align:center;">mean()</td>
<td>Mean of Values</td>
</tr>
<tr>

<td style="text-align:center;">median()</td>
<td>Median of Values</td>
</tr>
<tr>

<td style="text-align:center;">mode()</td>
<td>Mode of values</td>
</tr>
<tr>

<td style="text-align:center;">std()</td>
<td>Standard Deviation of the Values</td>
</tr>
<tr>

<td style="text-align:center;">min()</td>
<td>Minimum Value</td>
</tr>
<tr>

<td style="text-align:center;">max()</td>
<td>Maximum Value</td>
</tr>
<tr>

<td style="text-align:center;">abs()</td>
<td>Absolute Value</td>
</tr>
<tr>

<td style="text-align:center;">prod()</td>
<td>Product of Values</td>
</tr>
<tr>

<td style="text-align:center;">cumsum()</td>
<td>Cumulative Sum</td>
</tr>
<tr>

<td style="text-align:center;">cumprod()</td>
<td>Cumulative Product</td>
</tr>
</tbody></table>

In [22]:
d = {'Name':pd.Series(['Tom','James','Ricky','Vin','Steve','Smith','Jack',
   'Lee','David','Gasper','Betina','Andres']),
   'Age':pd.Series([25,26,25,23,30,29,23,34,40,30,51,46]),
   'Rating':pd.Series([4.23,3.24,3.98,2.56,3.20,4.6,3.8,3.78,2.98,4.80,4.10,3.65])
}

#Create a DataFrame
df = pd.DataFrame(d)
print (df)
print()
print(df.sum())
print()
print(df.sum(1))

      Name  Age  Rating
0      Tom   25    4.23
1    James   26    3.24
2    Ricky   25    3.98
3      Vin   23    2.56
4    Steve   30    3.20
5    Smith   29    4.60
6     Jack   23    3.80
7      Lee   34    3.78
8    David   40    2.98
9   Gasper   30    4.80
10  Betina   51    4.10
11  Andres   46    3.65

Name      TomJamesRickyVinSteveSmithJackLeeDavidGasperBe...
Age                                                     382
Rating                                                44.92
dtype: object

0     29.23
1     29.24
2     28.98
3     25.56
4     33.20
5     33.60
6     26.80
7     37.78
8     42.98
9     34.80
10    55.10
11    49.65
dtype: float64


In [24]:
print (df.describe())
print()
print (df.describe(include='all'))


# object − Summarizes String columns
# number − Summarizes Numeric columns
# all − Summarizes all columns together 

             Age     Rating
count  12.000000  12.000000
mean   31.833333   3.743333
std     9.232682   0.661628
min    23.000000   2.560000
25%    25.000000   3.230000
50%    29.500000   3.790000
75%    35.500000   4.132500
max    51.000000   4.800000

         Name        Age     Rating
count      12  12.000000  12.000000
unique     12        NaN        NaN
top     Ricky        NaN        NaN
freq        1        NaN        NaN
mean      NaN  31.833333   3.743333
std       NaN   9.232682   0.661628
min       NaN  23.000000   2.560000
25%       NaN  25.000000   3.230000
50%       NaN  29.500000   3.790000
75%       NaN  35.500000   4.132500
max       NaN  51.000000   4.800000


<h3>Python Pandas - Function Application</h3>

<b>Table wise Function Application: pipe()
</b>


Custom operations can be performed by passing the function and the appropriate number of parameters as pipe arguments. Thus, operation is performed on the whole DataFrame.


<b>Row or Column Wise Function Application: apply()
</b>


Arbitrary functions can be applied along the axes of a DataFrame or Panel using the apply() method, which, like the descriptive statistics methods, takes an optional axis argument. By default, the operation performs column wise, taking each column as an array-like.


<b>Element wise Function Application: applymap()
</b>

the methods applymap() on DataFrame and analogously map() on Series accept any Python function taking a single value and returning a single value.



In [38]:
df = pd.DataFrame([[1,2,3,4,5,6,7,8,9],[10,20,30,40,50,60,70,80,90]])
print( df.applymap(lambda x:x**2) )

print(df.apply(np.mean,axis=1))

     0    1    2     3     4     5     6     7     8
0    1    4    9    16    25    36    49    64    81
1  100  400  900  1600  2500  3600  4900  6400  8100
0     5.0
1    50.0
dtype: float64


<h3>Python Pandas - Reindexing And Renaming</h3>

<b>Reindex

In [40]:
N=20

df = pd.DataFrame({
   'A': pd.date_range(start='2016-01-01',periods=N,freq='D'),
   'x': np.linspace(0,stop=N-1,num=N),
   'y': np.random.rand(N),
   'C': np.random.choice(['Low','Medium','High'],N).tolist(),
   'D': np.random.normal(100, 10, size=(N)).tolist()
})

#reindex the DataFrame
df_reindexed = df.reindex(index=[0,2,5], columns=['A', 'C', 'B'])
print(df)
print (df_reindexed)

            A     x         y       C           D
0  2016-01-01   0.0  0.436940    High   78.562058
1  2016-01-02   1.0  0.308300  Medium   97.271276
2  2016-01-03   2.0  0.198512  Medium   90.828990
3  2016-01-04   3.0  0.878558  Medium   95.858641
4  2016-01-05   4.0  0.863035    High  111.391191
5  2016-01-06   5.0  0.109871    High   97.761229
6  2016-01-07   6.0  0.661311    High  112.144401
7  2016-01-08   7.0  0.423101  Medium  113.035207
8  2016-01-09   8.0  0.718770     Low  106.908970
9  2016-01-10   9.0  0.009904    High   86.266387
10 2016-01-11  10.0  0.323459  Medium  108.430143
11 2016-01-12  11.0  0.544658  Medium  105.899885
12 2016-01-13  12.0  0.725288    High  122.968484
13 2016-01-14  13.0  0.677373     Low   98.538545
14 2016-01-15  14.0  0.690644    High  100.546431
15 2016-01-16  15.0  0.415890    High  110.468204
16 2016-01-17  16.0  0.446759  Medium   93.017073
17 2016-01-18  17.0  0.581544     Low  101.931303
18 2016-01-19  18.0  0.124389    High  100.850237


<b>reindex_like and rename

In [50]:
df1 = pd.DataFrame(np.random.randn(10,3),columns=['col1','col2','col3'])
df2 = pd.DataFrame(np.random.randn(7,3),columns=['col1','col2','col30'])
print(df1)
df1 = df1.reindex_like(df2)
print (df1)
print( df1.rename(columns={'col1' : 'c1', 'col2' : 'c2'}))

       col1      col2      col3
0 -0.599265  0.345364  1.267003
1  0.646699 -0.170542  1.065252
2  0.069017 -0.674890 -0.216596
3 -2.217099  1.363995 -0.730604
4  1.284265 -0.725696 -1.109991
5 -1.673772  0.015104  0.055695
6 -0.339400  0.128017 -0.960818
7  0.431201 -0.960983 -0.953654
8 -0.230118  0.345102 -0.856678
9 -1.672235 -1.690922 -1.056215
       col1      col2  col30
0 -0.599265  0.345364    NaN
1  0.646699 -0.170542    NaN
2  0.069017 -0.674890    NaN
3 -2.217099  1.363995    NaN
4  1.284265 -0.725696    NaN
5 -1.673772  0.015104    NaN
6 -0.339400  0.128017    NaN
         c1        c2  col30
0 -0.599265  0.345364    NaN
1  0.646699 -0.170542    NaN
2  0.069017 -0.674890    NaN
3 -2.217099  1.363995    NaN
4  1.284265 -0.725696    NaN
5 -1.673772  0.015104    NaN
6 -0.339400  0.128017    NaN


<h3>Python Pandas - Iteration</h3>


basic iteration (for i in object) produces −

Series − values

DataFrame − column labels

<br><br>

To iterate over the rows of the DataFrame, we can use the following functions −

iteritems() − to iterate over the (key,value) pairs

iterrows() − iterate over the rows as (index,series) pairs

itertuples() − iterate over the rows as namedtuples

In [64]:
df = pd.DataFrame([[1,2,3],['a','b','c']],columns=['col1','col2','col3'],index = [0,1])
print(df)
print()
for key,value in df.iteritems():
    print (key)
    print(value)
print()
for key,value in df.iterrows():
    print (key)
    print(value)
print()
for key in df.itertuples():
    print (key)
   

  col1 col2 col3
0    1    2    3
1    a    b    c

col1
0    1
1    a
Name: col1, dtype: object
col2
0    2
1    b
Name: col2, dtype: object
col3
0    3
1    c
Name: col3, dtype: object

0
col1    1
col2    2
col3    3
Name: 0, dtype: object
1
col1    a
col2    b
col3    c
Name: 1, dtype: object

Pandas(Index=0, col1=1, col2=2, col3=3)
Pandas(Index=1, col1='a', col2='b', col3='c')


<h3>Python Pandas - Sorting</h3>

By Label - sort_index(): only sees the index or column names or values and sorts it 

Args : ascending=False , axis=1

By Value - sort_values(): sees the values in a row or a column and sorts the values

In [85]:
df = pd.DataFrame([[1,10],[10,3],[9,50],[2,40]],columns=['col1','col2'],index=[3,2,1,4])
print(df)
print()
print(df.sort_index())
print()
print(df.sort_values(by = ['col1','col2']))

   col1  col2
3     1    10
2    10     3
1     9    50
4     2    40

   col1  col2
1     9    50
2    10     3
3     1    10
4     2    40

   col1  col2
3     1    10
4     2    40
1     9    50
2    10     3


<h3>Python Pandas - Indexing and Selecting Data</h3>
<table class="table table-bordered">
<tbody><tr>

<th style="text-align:center;">Indexing &amp; Description</th>
</tr>
<tr>

<td>
<p><b>.loc()</b></p>
<p>Label based</p></td>
</tr>
<tr>

<td>
<p><b>.iloc()</b></p>
<p>Integer based</p></td>
</tr>
<tr>

<td>
<p><b>.ix()</b></p>
<p>Both Label and Integer based</p></td>
</tr>
</tbody></table>

In [113]:
print(df)
print()
print(df['col1'])
print()
print (df.loc[[3,4],'col2'])
print()
print(df.iloc[:2,1])
# these are 0-based indexing.

   col1  col2
3     1    10
2    10     3
1     9    50
4     2    40

3     1
2    10
1     9
4     2
Name: col1, dtype: int64

3    10
4    40
Name: col2, dtype: int64

3    10
2     3
Name: col2, dtype: int64


<h3>Python Pandas - Statistical Functions</h3>

df.Percent_change() : This function compares every element with its prior element and computes the change percentage.

sr1.Correlation(sr2) : Correlation shows the linear relationship between any two array of values 

<h3>Python Pandas - Window Functions and Aggregations

<h3>Python Pandas - Missing Data</h3>


<b>to check if it is an address is null or not</b>



.isnull() 

.notnull()
  
  
-- can be applied to columns too and the output will be element wise check on the column
  
  
<b>to fill the nan values in dataframe/series</b>


.fillna(0) # fills with 0

.fillna(method='pad')  # fills with approximate value like previous(pad/fill) or upcoming(bfill/backfill ) value


<b>to drop the row/column with nan</b>


.dropna()


.dropna(axis=1)


<b>to replace any value with some other value</b>


.replace({1000:10,2000:60})

<h3>Python Pandas - GroupBy</h3>

- obj.groupby(key,axis=1), .groupby(['Team','Year'])
- print df.groupby('Team').groups
- grouped.get_group(2014)

<h3>Python Pandas - Merging/Joining</h3>


pd.merge(left, right, how='inner', on=None, left_on=None, right_on=None,
left_index=False, right_index=False, sort=True)

left − A DataFrame object.

right − Another DataFrame object.

on − Columns (names) to join on. Must be found in both the left and right DataFrame objects.

left_on − Columns from the left DataFrame to use as keys. Can either be column names or arrays with length equal to the length of the DataFrame.

right_on − Columns from the right DataFrame to use as keys. Can either be column names or arrays with length equal to the length of the DataFrame.

left_index − If True, use the index (row labels) from the left DataFrame as its join key(s). In case of a DataFrame with a MultiIndex (hierarchical), the number of levels must match the number of join keys from the right DataFrame.

right_index − Same usage as left_index for the right DataFrame.

how − One of 'left', 'right', 'outer', 'inner'. Defaults to inner. Each method has been described below.

sort − Sort the result DataFrame by the join keys in lexicographical order. Defaults to True, setting to False will improve the performance substantially in many cases.

<table class="table table-bordered">
<tbody><tr>
<th style="text-align:center;">Merge Method</th>
<th style="text-align:center;">SQL Equivalent</th>
<th style="text-align:center;">Description</th>
</tr>
<tr>
<td style="text-align:center;">left</td>
<td style="text-align:center;">LEFT OUTER JOIN</td>
<td>Use keys from left object</td>
</tr>
<tr>
<td style="text-align:center;">right</td>
<td style="text-align:center;">RIGHT OUTER JOIN</td>
<td>Use keys from right object</td>
</tr>
<tr>
<td style="text-align:center;">outer</td>
<td style="text-align:center;">FULL OUTER JOIN</td>
<td>Use union of keys</td>
</tr>
<tr>
<td style="text-align:center;">inner</td>
<td style="text-align:center;">INNER JOIN</td>
<td>Use intersection of keys</td>
</tr>
</tbody></table>

<h3>Python Pandas - Concatenation</h3>

 pd.concat(objs,axis=0,join='outer',join_axes=None,
ignore_index=False)

- objs − This is a sequence or mapping of Series, DataFrame, or Panel objects.

- axis − {0, 1, ...}, default 0. This is the axis to concatenate along.

- join − {‘inner’, ‘outer’}, default ‘outer’. How to handle indexes on other axis(es). Outer for union and inner for intersection.

- ignore_index − boolean, default False. If True, do not use the index values on the concatenation axis. The resulting axis will be labeled 0, ..., n - 1.

- join_axes − This is the list of Index objects. Specific indexes to use for the other (n-1) axes instead of performing inner/outer set logic.



---
not completed https://www.tutorialspoint.com/python_pandas/
