In [1]:
import numpy as np
import pandas as pd

In [2]:
sales = pd.Series([100, 200, 100, 400])
print(sales.values)

[100 200 100 400]


In [3]:
print(sales.index)

RangeIndex(start=0, stop=4, step=1)


In [4]:
sales = pd.Series([100, 200, 100, 400], index = ['a', 'b', 'c', 'd'])
print(sales)

a    100
b    200
c    100
d    400
dtype: int64


In [5]:
print(sales.index)

Index(['a', 'b', 'c', 'd'], dtype='object')


In [6]:
sales = pd.Series([100, 200, 100, 400], index = ['Jan', 'Feb', 'Mar', 'Apr'], name = "4 Months Sales")
print(sales)

Jan    100
Feb    200
Mar    100
Apr    400
Name: 4 Months Sales, dtype: int64


In [7]:
canteen = pd.Series([300, 400, 100, 700, 300, 200, 200], index = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'], name = "Sandwiches Sold in One Week")
print(canteen)

Mon    300
Tue    400
Wed    100
Thu    700
Fri    300
Sat    200
Sun    200
Name: Sandwiches Sold in One Week, dtype: int64


In [8]:
print(canteen[1])

400


In [10]:
print(canteen["Tue"])

400


In [11]:
print(canteen[[3, 5]])

Thu    700
Sat    200
Name: Sandwiches Sold in One Week, dtype: int64


In [12]:
print(canteen[["Tue", "Fri"]])

Tue    400
Fri    300
Name: Sandwiches Sold in One Week, dtype: int64


In [13]:
print(canteen[canteen > 200])

Mon    300
Tue    400
Thu    700
Fri    300
Name: Sandwiches Sold in One Week, dtype: int64


In [16]:
canteen = canteen // 2
print(canteen)

Mon     75
Tue    100
Wed     25
Thu    175
Fri     75
Sat     50
Sun     50
Name: Sandwiches Sold in One Week, dtype: int64


In [17]:
"Fri" in canteen

True

In [18]:
200 in canteen

False

In [19]:
50 in canteen

False

In [21]:
arr = np.array([3, 2, 4, 5, 6])
ind = np.array(['a', 'b', 'c', 'd', 'e'])

In [22]:
obj2 = pd.Series(arr, index = ind)
obj2

a    3
b    2
c    4
d    5
e    6
dtype: int32

In [24]:
state = {
    'Sindh': 350,
    'Punjab': 300,
    'Kpk': 200,
    'Balochistan': 400
}

tax_by_state = pd.Series(state)
tax_by_state

Sindh          350
Punjab         300
Kpk            200
Balochistan    400
dtype: int64

In [25]:
print(tax_by_state.index)

Index(['Sindh', 'Punjab', 'Kpk', 'Balochistan'], dtype='object')


In [28]:
tax_by_state = pd.Series(state, index = ['Sindh', 'Kpk', 'Punjab', 'Balochistan'])
tax_by_state

Sindh          350
Kpk            200
Punjab         300
Balochistan    400
dtype: int64

In [29]:
tax_by_state = pd.Series(state, index = ['Sindh', 'Kpk', 'Punjab', 'Balochistan', 'Kashmir'])
tax_by_state

Sindh          350.0
Kpk            200.0
Punjab         300.0
Balochistan    400.0
Kashmir          NaN
dtype: float64

In [30]:
print(pd.isnull(tax_by_state))

Sindh          False
Kpk            False
Punjab         False
Balochistan    False
Kashmir         True
dtype: bool


In [31]:
tax_by_state.isnull()

Sindh          False
Kpk            False
Punjab         False
Balochistan    False
Kashmir         True
dtype: bool

In [32]:
tax_by_state.name = "State Tax Paying"

In [33]:
tax_by_state

Sindh          350.0
Kpk            200.0
Punjab         300.0
Balochistan    400.0
Kashmir          NaN
Name: State Tax Paying, dtype: float64

In [34]:
tax_by_state.index.name = "States Name"
tax_by_state

States Name
Sindh          350.0
Kpk            200.0
Punjab         300.0
Balochistan    400.0
Kashmir          NaN
Name: State Tax Paying, dtype: float64

In [2]:
# index should be same for merging two or more series to make dataframe
import pandas as pd
apples = pd.Series([3, 2, 0, 1])
oranges = pd.Series([3, 2, 0, 1])

print(apples, oranges)

0    3
1    2
2    0
3    1
dtype: int64 0    3
1    2
2    0
3    1
dtype: int64


In [3]:
data = {
    "apples": apples,
    "oranges": oranges
}

fruit_df = pd.DataFrame(data)

In [4]:
print(fruit_df)

   apples  oranges
0       3        3
1       2        2
2       0        0
3       1        1


In [5]:
data = {
    'State': ['Ohio', 'Nevada', 'Ohio', 'Ohio'],
    'Year': [2000, 1999, 1998, 2017],
    'Population': [1.5, 1.8, 3.6, 8.4]
}
frame = pd.DataFrame(data)

In [6]:
print(frame)

    State  Year  Population
0    Ohio  2000         1.5
1  Nevada  1999         1.8
2    Ohio  1998         3.6
3    Ohio  2017         8.4


In [7]:
frame.index = ['1st', '2nd', '3rd', '4th']

In [8]:
print(frame)

      State  Year  Population
1st    Ohio  2000         1.5
2nd  Nevada  1999         1.8
3rd    Ohio  1998         3.6
4th    Ohio  2017         8.4


In [9]:
frame.head()

Unnamed: 0,State,Year,Population
1st,Ohio,2000,1.5
2nd,Nevada,1999,1.8
3rd,Ohio,1998,3.6
4th,Ohio,2017,8.4


In [10]:
frame.head(1)

Unnamed: 0,State,Year,Population
1st,Ohio,2000,1.5


In [11]:
frame.columns = ['state', 'year', 'pop']
frame

Unnamed: 0,state,year,pop
1st,Ohio,2000,1.5
2nd,Nevada,1999,1.8
3rd,Ohio,1998,3.6
4th,Ohio,2017,8.4


In [12]:
frame2 = pd.DataFrame(data, 
                      columns = ['State', 'Year', 'Population', 'Debt'], 
                      index = ['One', 'Two', 'Three', 'Four']
                     )
frame2.head(2)

Unnamed: 0,State,Year,Population,Debt
One,Ohio,2000,1.5,
Two,Nevada,1999,1.8,


In [13]:
frame2.columns

Index(['State', 'Year', 'Population', 'Debt'], dtype='object')

In [14]:
frame2.index

Index(['One', 'Two', 'Three', 'Four'], dtype='object')

In [15]:
frame2["State"]

One        Ohio
Two      Nevada
Three      Ohio
Four       Ohio
Name: State, dtype: object

In [17]:
frame2.State

One        Ohio
Two      Nevada
Three      Ohio
Four       Ohio
Name: State, dtype: object

In [18]:
frame2.loc["Three"]

State         Ohio
Year          1998
Population     3.6
Debt           NaN
Name: Three, dtype: object

In [19]:
frame2.loc[:, "Year"]

One      2000
Two      1999
Three    1998
Four     2017
Name: Year, dtype: int64

In [20]:
# Function Application and Mapping

In [21]:
import numpy as np
import pandas as pd

frame = pd.DataFrame(np.random.randn(4, 3),
                     columns = list('bde'),
                     index = ['Utah', 'Ohio', 'Texas', 'Oregon']
                    )

In [22]:
frame.head()

Unnamed: 0,b,d,e
Utah,-1.901876,1.787733,2.163605
Ohio,1.616466,-0.253824,-0.505162
Texas,0.259587,0.497387,0.002481
Oregon,-0.033011,0.483074,-0.195014


In [27]:
print(frame)
print(np.abs(frame))

               b         d         e
Utah   -1.901876  1.787733  2.163605
Ohio    1.616466 -0.253824 -0.505162
Texas   0.259587  0.497387  0.002481
Oregon -0.033011  0.483074 -0.195014
               b         d         e
Utah    1.901876  1.787733  2.163605
Ohio    1.616466  0.253824  0.505162
Texas   0.259587  0.497387  0.002481
Oregon  0.033011  0.483074  0.195014


In [31]:
print("Min: ", frame.min())
print("Min in d: ", frame["d"].min())
print("Max in d: ", frame["d"].max())
print(frame["d"].max() - frame["d"].min())

Min:  b   -1.901876
d   -0.253824
e   -0.505162
dtype: float64
Min in d:  -0.25382380393265214
Max in d:  1.7877331877968925
2.0415569917295446


In [32]:
# lambda means we don't want to assign any name to that function
f = lambda x: x.max() - x.min()
df = frame.apply(f)
df.head()

b    3.518342
d    2.041557
e    2.668768
dtype: float64

In [33]:
print(df, type(df))

b    3.518342
d    2.041557
e    2.668768
dtype: float64 <class 'pandas.core.series.Series'>


In [34]:
df = frame.apply(f, axis = 1)
print(df)

Utah      4.065482
Ohio      2.121628
Texas     0.494907
Oregon    0.678088
dtype: float64


In [35]:
# axis = 1 => row wise 

In [36]:
def min_max(x):
    minimum = x.min()
    maximum = x.max()
    return pd.Series([minimum, maximum], index = ['Min', 'Max'])

df = frame.apply(min_max)
print(df, type(df))

            b         d         e
Min -1.901876 -0.253824 -0.505162
Max  1.616466  1.787733  2.163605 <class 'pandas.core.frame.DataFrame'>


In [38]:
data_df = pd.DataFrame(
    np.arange(16).reshape((4, 4)),
    index = ["Ohio", "Colorado", "Utah", "New York"],
    columns = ["One", "Two", "Three", "Four"]
)
print(data_df)

          One  Two  Three  Four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15


In [39]:
# arange =>> function create numerical array

In [41]:
print(data_df.loc['Colorado', 'Two'])

5


In [42]:
print(data_df.loc[["Colorado", "Utah"], ["Two", "Three"]])

          Two  Three
Colorado    5      6
Utah        9     10


In [43]:
# if want to use index instead of labels
print(data_df.iloc[2, [3, 0, 1]])

Four    11
One      8
Two      9
Name: Utah, dtype: int32


In [44]:
print(data_df.iloc[[1, 2], [1, 2]])

          Two  Three
Colorado    5      6
Utah        9     10


In [46]:
# Arithmetic and Data Alignment

df1 = pd.DataFrame(
    np.arange(9.).reshape((3, 3)),
    columns = list('bcd'),
    index = ["Ohio", "Texas", "Colorado"]
)

df2 = pd.DataFrame(
    np.arange(12.).reshape((4, 3)),
    columns = list('bde'),
    index = ["Utah", "Ohio", "Texas", "Oregon"]
)

print(df1)
print(df2)


            b    c    d
Ohio      0.0  1.0  2.0
Texas     3.0  4.0  5.0
Colorado  6.0  7.0  8.0
          b     d     e
Utah    0.0   1.0   2.0
Ohio    3.0   4.0   5.0
Texas   6.0   7.0   8.0
Oregon  9.0  10.0  11.0


In [47]:
df3 = df1 + df2
df3.head()

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


In [51]:
df3.replace(np.nan, 3, inplace = True)

In [52]:
df3.head()

Unnamed: 0,b,c,d,e
Colorado,3.0,3.0,3.0,3.0
Ohio,3.0,3.0,6.0,3.0
Oregon,3.0,3.0,3.0,3.0
Texas,9.0,3.0,12.0,3.0
Utah,3.0,3.0,3.0,3.0


In [58]:
df1 = pd.DataFrame(
    np.arange(9.).reshape((3, 3)),
    columns = list('bcd'),
    index = ["Ohio", "Texas", "Colorado"]
)

df2 = pd.DataFrame(
    np.arange(12.).reshape((4, 3)),
    columns = list('bcd'),
    index = ["Utah", "Ohio", "Texas", "Oregon"]
)

In [59]:
df3 = df1.add(df2, fill_value = 0)

In [60]:
df3.head()

Unnamed: 0,b,c,d
Colorado,6.0,7.0,8.0
Ohio,3.0,5.0,7.0
Oregon,9.0,10.0,11.0
Texas,9.0,11.0,13.0
Utah,0.0,1.0,2.0


In [64]:
state = ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada']
data = {
    "state": state,
    "year": [2000, 2001, 2002, 2001, 2002, 2003],
    "pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]
}
frame2 = pd.DataFrame(
    data,
    columns = ["year", "state", "pop", "debt"],
    index = ["one", "two", "three", "four", "five", "six"]
)

print(len(frame2))
ln = len(frame2)

6


In [65]:
rng = np.arange(ln)
print(rng)

[0 1 2 3 4 5]


In [66]:
frame2['debt'] = rng
frame2.head()

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0
two,2001,Ohio,1.7,1
three,2002,Ohio,3.6,2
four,2001,Nevada,2.4,3
five,2002,Nevada,2.9,4


In [68]:
val = pd.Series(
    [-1.2, -1.5, -1.7, -1.2, -1.5, -1.7],
    index = ['one', 'two', 'three', 'four', 'five', 'six']
)
frame2['debt'] = val

frame2.head(6)

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,-1.2
two,2001,Ohio,1.7,-1.5
three,2002,Ohio,3.6,-1.7
four,2001,Nevada,2.4,-1.2
five,2002,Nevada,2.9,-1.5
six,2003,Nevada,3.2,-1.7
