In [2]:
import math
import statistics as stats2

from matplotlib import pyplot as plt
from matplotlib.ticker import MaxNLocator
from scipy import stats
import numpy as np
import pandas as pd


%matplotlib inline

In [3]:
people = pd.DataFrame({
    "name": [ "Bob", "Sally", "Timmy", "Bridget", "Susan", "Steve" ],
    "city": [ "San Jose", "San Diego", "New York", "San Jose", "Austin", "New York" ],
    "state": [ "CA", "CA", "NY", "CA", "TX", "NY" ]
})

people

Unnamed: 0,name,city,state
0,Bob,San Jose,CA
1,Sally,San Diego,CA
2,Timmy,New York,NY
3,Bridget,San Jose,CA
4,Susan,Austin,TX
5,Steve,New York,NY


In [4]:
people.value_counts()

name     city       state
Bob      San Jose   CA       1
Bridget  San Jose   CA       1
Sally    San Diego  CA       1
Steve    New York   NY       1
Susan    Austin     TX       1
Timmy    New York   NY       1
dtype: int64

In [5]:
people["city"].value_counts()

San Jose     2
New York     2
San Diego    1
Austin       1
Name: city, dtype: int64

In [6]:
people[["city", "state"]].value_counts()

city       state
New York   NY       2
San Jose   CA       2
Austin     TX       1
San Diego  CA       1
dtype: int64

In [7]:
people.groupby([ "city", "state" ]).size()

city       state
Austin     TX       1
New York   NY       2
San Diego  CA       1
San Jose   CA       2
dtype: int64

In [9]:
people_grouped = people.groupby([ "city", "state" ]).size().sort_values(ascending=False)

people_grouped

city       state
New York   NY       2
San Jose   CA       2
Austin     TX       1
San Diego  CA       1
dtype: int64

In [10]:
type(people_grouped)

pandas.core.series.Series

In [11]:
people_grouped.unstack()

state,CA,NY,TX
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Austin,,,1.0
New York,,2.0,
San Diego,1.0,,
San Jose,2.0,,


In [13]:
state_by_city = people_grouped.unstack(0)
state_by_city

city,Austin,New York,San Diego,San Jose
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,,,1.0,2.0
NY,,2.0,,
TX,1.0,,,


In [15]:
state_by_city.fillna(0, inplace=True)
state_by_city

city,Austin,New York,San Diego,San Jose
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,0.0,0.0,1.0,2.0
NY,0.0,2.0,0.0,0.0
TX,1.0,0.0,0.0,0.0


In [16]:
state_by_city.sort_values(by=["New York"], ascending=False)

city,Austin,New York,San Diego,San Jose
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NY,0.0,2.0,0.0,0.0
CA,0.0,0.0,1.0,2.0
TX,1.0,0.0,0.0,0.0


In [17]:
import random

days = pd.Series({
    "1": random.random(),
    "2": random.random(),
    "5": random.random(),
    "7": random.random(),
    "8": random.random(),
    "9": random.random(),
})

display(days)

1    0.737450
2    0.842549
5    0.592881
7    0.714434
8    0.473816
9    0.771226
dtype: float64

In [18]:
days.index

Index(['1', '2', '5', '7', '8', '9'], dtype='object')

In [19]:
reindexed_days = days.reindex([ str(v) for v in range(1,11) ])

reindexed_days

1     0.737450
2     0.842549
3          NaN
4          NaN
5     0.592881
6          NaN
7     0.714434
8     0.473816
9     0.771226
10         NaN
dtype: float64

In [20]:
reindexed_days.fillna(0, inplace=True)

reindexed_days


1     0.737450
2     0.842549
3     0.000000
4     0.000000
5     0.592881
6     0.000000
7     0.714434
8     0.473816
9     0.771226
10    0.000000
dtype: float64

In [22]:
df= pd.DataFrame({'number': np.random.randint(1, 100, 100)})

df

Unnamed: 0,number
0,79
1,97
2,15
3,58
4,37
...,...
95,2
96,25
97,51
98,75


In [23]:
df['bins'] = pd.cut(x=df['number'], bins=[1, 20, 40, 60, 80, 100])

print(df)
print(df['bins'].unique())
print(df['bins'].value_counts())

    number       bins
0       79   (60, 80]
1       97  (80, 100]
2       15    (1, 20]
3       58   (40, 60]
4       37   (20, 40]
..     ...        ...
95       2    (1, 20]
96      25   (20, 40]
97      51   (40, 60]
98      75   (60, 80]
99       9    (1, 20]

[100 rows x 2 columns]
[(60.0, 80.0], (80.0, 100.0], (1.0, 20.0], (40.0, 60.0], (20.0, 40.0], NaN]
Categories (5, interval[int64, right]): [(1, 20] < (20, 40] < (40, 60] < (60, 80] < (80, 100]]
(1, 20]      26
(40, 60]     25
(20, 40]     22
(60, 80]     14
(80, 100]    12
Name: bins, dtype: int64


In [25]:
df = pd.DataFrame({'team': ['A', 'A', 'A', 'A', 'B', 'B', 'B', 'B'],
                   'points': [10, 10, 12, 12, 15, 17, 20, 20],
                   'assists': [5, 5, 7, 9, 12, 9, 6, 6]})

df

Unnamed: 0,team,points,assists
0,A,10,5
1,A,10,5
2,A,12,7
3,A,12,9
4,B,15,12
5,B,17,9
6,B,20,6
7,B,20,6


In [26]:
df[df.duplicated]

Unnamed: 0,team,points,assists
1,A,10,5
7,B,20,6


In [27]:
df[df.duplicated(keep='last')]

Unnamed: 0,team,points,assists
0,A,10,5
6,B,20,6


In [28]:
df.drop_duplicates()

Unnamed: 0,team,points,assists
0,A,10,5
2,A,12,7
3,A,12,9
4,B,15,12
5,B,17,9
6,B,20,6


In [29]:
df.drop_duplicates(keep='last')

Unnamed: 0,team,points,assists
1,A,10,5
2,A,12,7
3,A,12,9
4,B,15,12
5,B,17,9
7,B,20,6


In [30]:
df.drop_duplicates(subset=['team'])

Unnamed: 0,team,points,assists
0,A,10,5
4,B,15,12
