In [1]:
#Pandas is  a major tool for ML.
# It contains data structures and data manipulation tools designed to make data cleaning and analysis fast and convenient
#pandas is designed for working with tabular or heterogeneous data.
#Numpy ia best suited for working with homogeneously typed numerical array data.

In [4]:
import numpy as np

import pandas as pd

In [8]:
from pandas import Series,DataFrame
#series and dataframe has their own local names

In [9]:
#Series
#Series is a one-dimensional array-like object containing a sequence of values
obj = pd.Series([4,7,-5,3])

obj

0    4
1    7
2   -5
3    3
dtype: int64

In [12]:

obj.array

<NumpyExtensionArray>
[4, 7, -5, 3]
Length: 4, dtype: int64

In [13]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [14]:
#Often, you'll want to create a Series with an index identifying each data point with a label
#because You use an index in a Pandas Series to label each data point, making the data easier to understand, access, and 

obj2 = pd.Series([4,7,-5,3], index=["d","b","a","c"])

obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [20]:
obj2.index


Index(['d', 'b', 'a', 'c'], dtype='object')

In [21]:
#As you can see its range from d to c
obj2["a"]

np.int64(-5)

In [24]:
obj2["d"] = 6

In [25]:
obj2[["c","a","d"]]

c    3
a   -5
d    6
dtype: int64

In [26]:
#Using numpy functons will preserve the index value
np.exp(obj2)

d     403.428793
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

In [31]:
#you can create  a Series from dictionary
sdata = {"Ohio": 35000, "Texas": 71000, "Oregon": 16000, "Utah": 5000}

obj3 = pd.Series(sdata)
#series
obj3


Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [32]:
#dictionary
sdata

{'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}

In [33]:
obj3.to_dict()

{'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}

In [37]:
states = ["California", "Ohio", "Oregon", "Texas"]

obj4= pd.Series(sdata, index=states)
obj4
#as you can see keys and values are matching with themselves according to sdata ignoring states and california is nan because it is not in sdata

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [38]:
pd.isna(obj4)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [40]:
pd.notna(obj4)

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [41]:
#also can use
obj4.isna()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [44]:
#Both the Series object itself and its index have a name attribute, which integrates with other areas of pandas functionality:
obj4.name = "population"

obj3.index.name ="state"

obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

In [46]:
df = obj4.to_frame()
print(df)


            population
California         NaN
Ohio           35000.0
Oregon         16000.0
Texas          71000.0


In [47]:
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [49]:
obj.index = ["Bob", "Steve", "Jeff", "Ryan"]

obj

Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64

In [50]:
#Dataframe
#A DataFrame represents a rectangular table of data and contains an ordered, named collection of columns, each of which can be a different value type (numeric, string, Boolean, etc.)
#The DataFrame has both a row and column index
#it can be thought of as a dictionary of Series all sharing the same index
data = {"state": ["Ohio", "Ohio", "Ohio", "Nevada", "Nevada", "Nevada"],
        "year": [2000, 2001, 2002, 2001, 2002, 2003],
        "pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame = pd.DataFrame(data)

frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [53]:
#For large DataFrames, the head method selects only the first five rows:
frame.head()
#Similarly, tail returns the last five rows:

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [54]:
#If you specify a sequence of columns, the DataFrame’s columns will be arranged in that order:
pd.DataFrame(data,columns=["year","state","pop"])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [55]:
#If you pass a column that isn’t contained in the dictionary, it will appear with missing values in the result:
frame2 = pd.DataFrame(data, columns=["year", "state", "pop", "debt"])
frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,
1,2001,Ohio,1.7,
2,2002,Ohio,3.6,
3,2001,Nevada,2.4,
4,2002,Nevada,2.9,
5,2003,Nevada,3.2,


In [56]:
frame2["state"]

0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
5    Nevada
Name: state, dtype: object

In [57]:
frame2.state

0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
5    Nevada
Name: state, dtype: object

In [58]:
#Rows can also be retrieved by position or name with the special iloc and loc attributes
frame2.loc[1]

year     2001
state    Ohio
pop       1.7
debt      NaN
Name: 1, dtype: object

In [59]:
frame2.iloc[1]

year     2001
state    Ohio
pop       1.7
debt      NaN
Name: 1, dtype: object

In [61]:
frame2["debt"] = 16.5

frame2
#as you can see I assigned 16.5 to the empty column

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,16.5
1,2001,Ohio,1.7,16.5
2,2002,Ohio,3.6,16.5
3,2001,Nevada,2.4,16.5
4,2002,Nevada,2.9,16.5
5,2003,Nevada,3.2,16.5


In [64]:
#or you can do this
val = pd.Series([-1.2,-1.5,-1.7], index=[2,4,5])

frame2["debt"] = val

frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,
1,2001,Ohio,1.7,
2,2002,Ohio,3.6,-1.2
3,2001,Nevada,2.4,
4,2002,Nevada,2.9,-1.5
5,2003,Nevada,3.2,-1.7


In [65]:
#del keyword deletes dolumns

frame2["eastern"] = frame2["state"] == "Ohio"

frame2

Unnamed: 0,year,state,pop,debt,eastern
0,2000,Ohio,1.5,,True
1,2001,Ohio,1.7,,True
2,2002,Ohio,3.6,-1.2,True
3,2001,Nevada,2.4,,False
4,2002,Nevada,2.9,-1.5,False
5,2003,Nevada,3.2,-1.7,False


In [66]:
del frame2["eastern"]

In [67]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [68]:
#another common form of data is a nested dictionary of dictionaries
populations = {"Ohio": {2000: 1.5, 2001: 1.7, 2002: 3.6},"Nevada": {2001: 2.4, 2002: 2.9}}

frame3 = pd.DataFrame(populations)

frame3

Unnamed: 0,Ohio,Nevada
2000,1.5,
2001,1.7,2.4
2002,3.6,2.9


In [69]:
#you can transpose it
frame3.T

Unnamed: 0,2000,2001,2002
Ohio,1.5,1.7,3.6
Nevada,,2.4,2.9


In [72]:
#you can change the which indexes you want to show
pd.DataFrame(populations,index=[2000,2001,2002,2003,2004,2005])

Unnamed: 0,Ohio,Nevada
2000,1.5,
2001,1.7,2.4
2002,3.6,2.9
2003,,
2004,,
2005,,


In [73]:
#If a DataFrame’s index and columns have their name attributes set, these will also be displayed:
frame3.index.name ="year"

frame3.columns.name="state"
frame3

state,Ohio,Nevada
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,1.5,
2001,1.7,2.4
2002,3.6,2.9
