In [90]:
!pip install statsmodels

Collecting statsmodels
  Downloading statsmodels-0.14.0-cp38-cp38-macosx_11_0_arm64.whl (9.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.3/9.3 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting scipy!=1.9.2,>=1.4 (from statsmodels)
  Using cached scipy-1.10.1-cp38-cp38-macosx_12_0_arm64.whl (28.8 MB)
Collecting patsy>=0.5.2 (from statsmodels)
  Using cached patsy-0.5.3-py2.py3-none-any.whl (233 kB)
Installing collected packages: scipy, patsy, statsmodels
Successfully installed patsy-0.5.3 scipy-1.10.1 statsmodels-0.14.0


In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

# Essential basic functionality 

In [2]:
# Seed NumPy's global generator once, before the first random draw, so every
# random example in the notebook is reproducible after a kernel restart.
np.random.seed(42)

# Eight consecutive daily timestamps starting 2000-01-01; row index of `df`.
index = pd.date_range("1/1/2000", periods=8)

# Five random values labelled "a" to "e".
s = pd.Series(np.random.randn(5), index=["a", "b", "c", "d", "e"])

# 8x3 frame of random values, one row per date in `index`.
df = pd.DataFrame(np.random.randn(8, 3), index=index, columns=["A", "B", "C"])


# Attributes and underlying data

In [3]:
s.array

<PandasArray>
[  0.8196497023283962,   1.2911028589063331, -0.35547744395198044,
  0.14332205582486726,   -1.033172260353623]
Length: 5, dtype: float64

In [5]:
s.to_numpy()

array([ 0.8196497 ,  1.29110286, -0.35547744,  0.14332206, -1.03317226])

In [6]:
np.array(s)

array([ 0.8196497 ,  1.29110286, -0.35547744,  0.14332206, -1.03317226])

In [7]:
ser = pd.Series(pd.date_range("2000", periods=2, tz="CET"))

ser.to_numpy(dtype=object)

array([Timestamp('2000-01-01 00:00:00+0100', tz='CET'),
       Timestamp('2000-01-02 00:00:00+0100', tz='CET')], dtype=object)

In [8]:
ser.to_numpy(dtype="datetime64[ns]")

array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00.000000000'],
      dtype='datetime64[ns]')

In [9]:
df.to_numpy()

array([[-0.00833448, -0.65429495,  0.58418122],
       [ 0.61039683, -0.57389245,  1.11800247],
       [-0.88732152, -1.64775112, -1.08575416],
       [ 0.16871046,  2.08284189, -2.08508833],
       [ 0.32889739, -0.10918762,  0.30466808],
       [-0.03496041, -1.130815  ,  0.77447725],
       [-0.67129584,  0.04063606,  0.0244578 ],
       [-0.45135266, -0.21436038, -0.15818336]])

# Accelerated operations

pandas supports accelerating certain types of binary numerical and boolean operations using the numexpr and bottleneck libraries.

These libraries are especially useful when dealing with large data sets, and provide large speedups. numexpr uses smart chunking, caching, and multiple cores. bottleneck is a set of specialized Cython routines that are especially fast when dealing with arrays that have NaNs.



In [10]:
# Note: both calls pass False, i.e. they turn the two optional accelerators
# described above OFF for the rest of this session.
pd.set_option("compute.use_bottleneck", False)
pd.set_option("compute.use_numexpr", False)

# Flexible binary operations
## Matching / broadcasting behavior

In [102]:
# Three random columns whose row labels only partly overlap. Building the frame
# aligns them on the union of labels ("a".."d") and leaves NaN wherever a
# column has no value for a label ("one" lacks "d", "three" lacks "a").
df = pd.DataFrame(
    {
        name: pd.Series(np.random.randn(len(labels)), index=list(labels))
        for name, labels in (("one", "abc"), ("two", "abcd"), ("three", "bcd"))
    }
)

df

Unnamed: 0,one,two,three
a,-0.894083,0.820709,
b,0.335993,0.406543,-0.540722
c,-2.505089,-0.20819,-0.667371
d,,-1.534677,-0.284604


In [13]:
row = df.iloc[1]
row

one      0.556735
two      2.136808
three    0.083718
Name: b, dtype: float64

In [14]:
column = df["two"]
column

a    2.413864
b    2.136808
c   -1.293691
d    0.912339
Name: two, dtype: float64

In [17]:
df.sub(row, axis='columns')
# Same as df.sub(row, axis=1)

Unnamed: 0,one,two,three
a,-1.492632,0.277056,
b,0.0,0.0,0.0
c,1.065216,-3.430498,1.541465
d,,-1.224469,-1.292357


In [19]:
df.sub(column, axis="index")
# Same as df.sub(column, axis=0)

Unnamed: 0,one,two,three
a,-3.349761,0.0,
b,-1.580073,0.0,-2.05309
c,2.915642,0.0,2.918874
d,,0.0,-2.120978


In [21]:
dfmi = df.copy()
dfmi

Unnamed: 0,one,two,three
a,-0.935897,2.413864,
b,0.556735,2.136808,0.083718
c,1.621951,-1.293691,1.625183
d,,0.912339,-1.208639


In [22]:
# Swap the flat row labels for a two-level index whose levels are named
# "first" and "second"; the last row is the only one in group "2".
dfmi.index = pd.MultiIndex.from_tuples(
    [("1", "a"), ("1", "b"), ("1", "c"), ("2", "c")],
    names=["first", "second"],
)

In [23]:
dfmi

Unnamed: 0_level_0,Unnamed: 1_level_0,one,two,three
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,a,-0.935897,2.413864,
1,b,0.556735,2.136808,0.083718
1,c,1.621951,-1.293691,1.625183
2,c,,0.912339,-1.208639


In [25]:
s = pd.Series(np.arange(10))
s

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int64

In [26]:
div, rem = divmod(s, 3)

In [27]:
div

0    0
1    0
2    0
3    1
4    1
5    1
6    2
7    2
8    2
9    3
dtype: int64

In [28]:
rem

0    0
1    1
2    2
3    0
4    1
5    2
6    0
7    1
8    2
9    0
dtype: int64

## Missing data / operations with fill values

In [29]:
df

Unnamed: 0,one,two,three
a,-0.935897,2.413864,
b,0.556735,2.136808,0.083718
c,1.621951,-1.293691,1.625183
d,,0.912339,-1.208639


In [32]:
df2 = df.copy()
df2

Unnamed: 0,one,two,three
a,-0.935897,2.413864,
b,0.556735,2.136808,0.083718
c,1.621951,-1.293691,1.625183
d,,0.912339,-1.208639


In [35]:
# Give `df` a value in a cell that is still NaN in the copy `df2`, so the
# fill_value example below has a slot missing on one side only.
df.at['a','three'] = 1.0

In [36]:
df + df2

Unnamed: 0,one,two,three
a,-1.871795,4.827728,
b,1.11347,4.273615,0.167436
c,3.243903,-2.587382,3.250366
d,,1.824677,-2.417278


In [37]:
df.add(df2, fill_value=0)

Unnamed: 0,one,two,three
a,-1.871795,4.827728,1.0
b,1.11347,4.273615,0.167436
c,3.243903,-2.587382,3.250366
d,,1.824677,-2.417278


## Flexible comparisons

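Series and DataFrame provide the binary comparison methods `eq`, `ne`, `lt`, `gt`, `le`, and `ge`, which behave like the flexible arithmetic methods shown above.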

In [38]:
df.gt(df2)

Unnamed: 0,one,two,three
a,False,False,False
b,False,False,False
c,False,False,False
d,False,False,False


In [39]:
df.ne(df2)

Unnamed: 0,one,two,three
a,False,False,True
b,False,False,False
c,False,False,False
d,True,False,False


## Boolean reduction

In [40]:
(df > 0).all()

one      False
two      False
three    False
dtype: bool

In [41]:
(df > 0).any()

one      True
two      True
three    True
dtype: bool

In [42]:
(df > 0).any().any()

True

In [43]:
df.empty


False

In [44]:
pd.DataFrame(index=list('ABC')).empty

True

## Comparing if objects are equivalent

In [45]:
df + df == df * 2

Unnamed: 0,one,two,three
a,True,True,True
b,True,True,True
c,True,True,True
d,False,True,True


In [46]:
np.nan == np.nan

False

In [48]:
(df + df).equals(df * 2)

True

## Comparing array-like objects

In [49]:
pd.Series(["foo", "bar", "baz"]) == "foo"

0     True
1    False
2    False
dtype: bool

In [50]:
pd.Index(["foo", "bar", "baz"]) == "foo"

array([ True, False, False])

In [51]:
pd.Series(["foo", "bar", "baz"]) == pd.Index(["foo", "bar", "qux"])


0     True
1     True
2    False
dtype: bool

## Descriptive statistics

In [52]:
df

Unnamed: 0,one,two,three
a,-0.935897,2.413864,1.0
b,0.556735,2.136808,0.083718
c,1.621951,-1.293691,1.625183
d,,0.912339,-1.208639


In [53]:
# 500 random draws, then two overwrites by position: everything from position
# 20 to the end becomes missing and positions 10-19 become the constant 5,
# which leaves exactly 20 non-null values for describe() to summarise.
series = pd.Series(np.random.randn(500))
series.iloc[20:500] = np.nan
series.iloc[10:20] = 5

In [54]:
series.describe(percentiles=[0.05, 0.25, 0.75, 0.95])

count    20.000000
mean      2.812316
std       2.287752
min      -0.279285
5%       -0.229999
25%       0.737206
50%       3.229386
75%       5.000000
95%       5.000000
max       5.000000
dtype: float64

In [55]:
s = pd.Series(["a", "a", "b", "b", "a", "a", np.nan, "c", "d", "a"])

s.describe()

count     9
unique    4
top       a
freq      5
dtype: object

## Discretization and quantiling

In [58]:
arr = np.random.randn(20)
arr

array([ 0.83331473, -1.34821152,  0.18184354,  1.08130754,  0.97827751,
       -0.27518267, -1.32307626,  0.09552411,  0.64732714, -0.6125712 ,
       -0.70848636, -0.34304512, -1.32770696, -0.33662094, -1.36880012,
        0.14292033, -1.2144521 ,  1.89707941, -0.72403955, -0.49118564])

In [59]:
factor = pd.cut(arr, 4)
factor

[(0.264, 1.081], (-1.372, -0.552], (-0.552, 0.264], (1.081, 1.897], (0.264, 1.081], ..., (-0.552, 0.264], (-1.372, -0.552], (1.081, 1.897], (-1.372, -0.552], (-0.552, 0.264]]
Length: 20
Categories (4, interval[float64, right]): [(-1.372, -0.552] < (-0.552, 0.264] < (0.264, 1.081] < (1.081, 1.897]]

In [61]:
factor = pd.cut(arr, [-5, -1, 0, 1, 5])
factor

[(0, 1], (-5, -1], (0, 1], (1, 5], (0, 1], ..., (0, 1], (-5, -1], (1, 5], (-1, 0], (-1, 0]]
Length: 20
Categories (4, interval[int64, right]): [(-5, -1] < (-1, 0] < (0, 1] < (1, 5]]

In [75]:
arr = np.random.randint(0,15,30000)
arr

array([14,  5,  7, ...,  5,  0,  9])

In [76]:
factor = pd.qcut(arr, [0, 0.25, 0.5, 0.75, 1])
factor

[(11.0, 14.0], (3.0, 7.0], (3.0, 7.0], (11.0, 14.0], (11.0, 14.0], ..., (3.0, 7.0], (3.0, 7.0], (3.0, 7.0], (-0.001, 3.0], (7.0, 11.0]]
Length: 30000
Categories (4, interval[float64, right]): [(-0.001, 3.0] < (3.0, 7.0] < (7.0, 11.0] < (11.0, 14.0]]

# Function application

- Tablewise Function Application: pipe()

- Row or Column-wise Function Application: apply()

- Aggregation API: agg() and transform()

- Applying Elementwise Functions: applymap()

## Tablewise function application

In [79]:
def extract_city_name(df):
    """
    Add a ``city_name`` column holding the text before the first comma of
    ``city_and_code`` (e.g. "Chicago, IL" -> "Chicago").

    Parameters
    ----------
    df : DataFrame with a string column ``city_and_code``.

    Returns
    -------
    A new DataFrame with the extra ``city_name`` column. The frame passed in
    is not modified, so the function can be re-run or used in a
    ``DataFrame.pipe`` chain without changing the caller's data.
    """
    # assign() returns a new frame instead of writing into the caller's one.
    return df.assign(city_name=df["city_and_code"].str.split(",").str.get(0))


def add_country_name(df, country_name=None):
    """
    Add a ``city_and_country`` column made of ``city_name`` immediately
    followed by ``country_name`` (e.g. "Chicago" and "US" -> "ChicagoUS").

    Parameters
    ----------
    df : DataFrame with a string column ``city_name``.
    country_name : text appended to every city name, with no separator.
        NOTE(review): the ``None`` default is handed to ``+`` unchanged; every
        call in this notebook passes a string - confirm what should happen
        when the argument is omitted.

    Returns
    -------
    A new DataFrame with the extra ``city_and_country`` column. The frame
    passed in is not modified.
    """
    col = "city_name"
    # assign() returns a new frame instead of writing into the caller's one.
    return df.assign(city_and_country=df[col] + country_name)


# One-row frame used to try out the two functions defined above.
df_p = pd.DataFrame({"city_and_code": ["Chicago, IL"]})

In [78]:
add_country_name(extract_city_name(df_p), country_name="US")

Unnamed: 0,city_and_code,city_name,city_and_country
0,"Chicago, IL",Chicago,ChicagoUS


In [80]:
df_p

Unnamed: 0,city_and_code
0,"Chicago, IL"


In [83]:
# The same two steps as the nested call, written as a left-to-right chain:
# each pipe() hands the frame produced so far to the next function.
(
    df_p
    .pipe(extract_city_name)
    .pipe(add_country_name, country_name="US")
)

Unnamed: 0,city_and_code,city_name,city_and_country
0,"Chicago, IL",Chicago,ChicagoUS


pandas encourages the second style, which is known as method chaining. pipe makes it easy to use your own or another library’s functions in method chains, alongside pandas’ methods.

In [94]:
import statsmodels.formula.api as sm

# Locate the sample data without hard-coding one user's absolute path. The
# directory can be supplied through the PANDAS_DOC_DATA_DIR environment
# variable; otherwise it falls back to the pandas checkout under the current
# user's home directory (the layout this notebook was written against).
default_data_dir = Path.home() / "Projets" / "training" / "pandas" / "pandas" / "doc" / "data"
path_baseball = Path(os.environ.get("PANDAS_DOC_DATA_DIR", default_data_dir)) / "baseball.csv"
bb = pd.read_csv(path_baseball)

bb.head()

Unnamed: 0,id,player,year,stint,team,lg,g,ab,r,h,...,rbi,sb,cs,bb,so,ibb,hbp,sh,sf,gidp
0,88641,womacto01,2006,2,CHN,NL,19,50,6,14,...,2.0,1.0,1.0,4,4.0,0.0,0.0,3.0,0.0,0.0
1,88643,schilcu01,2006,1,BOS,AL,31,2,0,1,...,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0
2,88645,myersmi01,2006,1,NYA,AL,62,0,0,0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,88649,helliri01,2006,1,MIL,NL,20,3,0,0,...,0.0,0.0,0.0,0,2.0,0.0,0.0,0.0,0.0,0.0
4,88650,johnsra05,2006,1,NYA,AL,33,6,0,1,...,0.0,0.0,0.0,0,4.0,0.0,0.0,0.0,0.0,0.0


In [93]:
(
    # Keep only rows with at least one hit so np.log(h) below stays finite.
    bb.query("h > 0")
    # `df` here is the lambda's own argument (the filtered frame), not the
    # notebook-level variable of the same name.
    .assign(ln_h=lambda df: np.log(df.h))
    # The (callable, "data") tuple tells pipe() to pass the frame to sm.ols as
    # the keyword argument `data`; the formula string is passed positionally.
    .pipe((sm.ols, "data"), "hr ~ ln_h + year + g + C(lg)")
    .fit()
    .summary()
)

0,1,2,3
Dep. Variable:,hr,R-squared:,0.685
Model:,OLS,Adj. R-squared:,0.665
Method:,Least Squares,F-statistic:,34.28
Date:,"Fri, 09 Jun 2023",Prob (F-statistic):,3.48e-15
Time:,11:49:23,Log-Likelihood:,-205.92
No. Observations:,68,AIC:,421.8
Df Residuals:,63,BIC:,432.9
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-8484.7720,4664.146,-1.819,0.074,-1.78e+04,835.780
C(lg)[T.NL],-2.2736,1.325,-1.716,0.091,-4.922,0.375
ln_h,-1.3542,0.875,-1.547,0.127,-3.103,0.395
year,4.2277,2.324,1.819,0.074,-0.417,8.872
g,0.1841,0.029,6.258,0.000,0.125,0.243

0,1,2,3
Omnibus:,10.875,Durbin-Watson:,1.999
Prob(Omnibus):,0.004,Jarque-Bera (JB):,17.298
Skew:,0.537,Prob(JB):,0.000175
Kurtosis:,5.225,Cond. No.,14900000.0


For the `(callable, data_keyword)` tuple form used above, see the `DataFrame.pipe` documentation: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.pipe.html#pandas.DataFrame.pipe

In [95]:
bb.query('h > 0')

Unnamed: 0,id,player,year,stint,team,lg,g,ab,r,h,...,rbi,sb,cs,bb,so,ibb,hbp,sh,sf,gidp
0,88641,womacto01,2006,2,CHN,NL,19,50,6,14,...,2.0,1.0,1.0,4,4.0,0.0,0.0,3.0,0.0,0.0
1,88643,schilcu01,2006,1,BOS,AL,31,2,0,1,...,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0
4,88650,johnsra05,2006,1,NYA,AL,33,6,0,1,...,0.0,0.0,0.0,0,4.0,0.0,0.0,0.0,0.0,0.0
5,88652,finlest01,2006,1,SFN,NL,139,426,66,105,...,40.0,7.0,0.0,46,55.0,2.0,2.0,3.0,4.0,6.0
6,88653,gonzalu01,2006,1,ARI,NL,153,586,93,159,...,73.0,0.0,1.0,69,58.0,10.0,7.0,0.0,6.0,14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93,89521,bondsba01,2007,1,SFN,NL,126,340,75,94,...,66.0,5.0,0.0,132,54.0,43.0,3.0,0.0,2.0,13.0
94,89523,biggicr01,2007,1,HOU,NL,141,517,68,130,...,50.0,4.0,3.0,23,112.0,0.0,3.0,7.0,5.0,5.0
97,89530,ausmubr01,2007,1,HOU,NL,117,349,38,82,...,25.0,6.0,1.0,37,74.0,3.0,6.0,4.0,1.0,11.0
98,89533,aloumo01,2007,1,NYN,NL,87,328,51,112,...,49.0,3.0,0.0,27,30.0,5.0,2.0,0.0,3.0,13.0


In [99]:
bb.query('h <= 0')

Unnamed: 0,id,player,year,stint,team,lg,g,ab,r,h,...,rbi,sb,cs,bb,so,ibb,hbp,sh,sf,gidp
2,88645,myersmi01,2006,1,NYA,AL,62,0,0,0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,88649,helliri01,2006,1,MIL,NL,20,3,0,0,...,0.0,0.0,0.0,0,2.0,0.0,0.0,0.0,0.0,0.0
11,89333,witasja01,2007,1,TBA,AL,3,0,0,0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
13,89335,wickmbo01,2007,2,ARI,NL,8,0,0,0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
14,89336,wickmbo01,2007,1,ATL,NL,47,0,0,0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
16,89338,whiteri01,2007,1,HOU,NL,20,1,0,0,...,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0
19,89341,weathda01,2007,1,CIN,NL,67,0,0,0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
21,89345,wakefti01,2007,1,BOS,AL,1,2,0,0,...,0.0,0.0,0.0,0,2.0,0.0,0.0,0.0,0.0,0.0
23,89348,villoro01,2007,1,NYA,AL,6,0,0,0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
26,89355,trachst01,2007,1,BAL,AL,3,5,0,0,...,0.0,0.0,0.0,0,3.0,0.0,0.0,0.0,0.0,0.0


## Row or column-wise function application

In [101]:
df

Unnamed: 0,id,player,year,stint,team,lg,g,ab,r,h,...,rbi,sb,cs,bb,so,ibb,hbp,sh,sf,gidp
0,88641,womacto01,2006,2,CHN,NL,19,50,6,14,...,2.0,1.0,1.0,4,4.0,0.0,0.0,3.0,0.0,0.0
1,88643,schilcu01,2006,1,BOS,AL,31,2,0,1,...,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0
2,88645,myersmi01,2006,1,NYA,AL,62,0,0,0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,88649,helliri01,2006,1,MIL,NL,20,3,0,0,...,0.0,0.0,0.0,0,2.0,0.0,0.0,0.0,0.0,0.0
4,88650,johnsra05,2006,1,NYA,AL,33,6,0,1,...,0.0,0.0,0.0,0,4.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,89525,benitar01,2007,2,FLO,NL,34,0,0,0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
96,89526,benitar01,2007,1,SFN,NL,19,0,0,0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
97,89530,ausmubr01,2007,1,HOU,NL,117,349,38,82,...,25.0,6.0,1.0,37,74.0,3.0,6.0,4.0,1.0,11.0
98,89533,aloumo01,2007,1,NYN,NL,87,328,51,112,...,49.0,3.0,0.0,27,30.0,5.0,2.0,0.0,3.0,13.0


In [104]:
df.apply(np.mean)

one     -1.021060
two     -0.128904
three   -0.497566
dtype: float64

In [106]:
df.apply(np.mean, axis=1)

a   -0.036687
b    0.067271
c   -1.126883
d   -0.909640
dtype: float64

In [107]:
df.apply(lambda x: x.max() - x.min())

one      2.841082
two      2.355386
three    0.382768
dtype: float64