# DataFrames Intro

In [2]:
import pandas as pd


In [12]:
# read the player data from data/nba.csv into a DataFrame
nba = pd.read_csv('data/nba.csv')

In [13]:
# preview the first 3 rows
nba.head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,


In [5]:
# row labels of the DataFrame (a RangeIndex here)
nba.index

RangeIndex(start=0, stop=458, step=1)

In [6]:
# the data as a 2-D NumPy array; dtype is object here because the
# columns mix strings and floats
nba.values

array([['Avery Bradley', 'Boston Celtics', 0.0, ..., 180.0, 'Texas',
        7730337.0],
       ['Jae Crowder', 'Boston Celtics', 99.0, ..., 235.0, 'Marquette',
        6796117.0],
       ['John Holland', 'Boston Celtics', 30.0, ..., 205.0,
        'Boston University', nan],
       ...,
       ['Tibor Pleiss', 'Utah Jazz', 21.0, ..., 256.0, nan, 2900000.0],
       ['Jeff Withey', 'Utah Jazz', 24.0, ..., 231.0, 'Kansas', 947276.0],
       [nan, nan, nan, ..., nan, nan, nan]], dtype=object)

In [7]:
# shape is a python tuple: (number of rows, number of columns)
nba.shape

(458, 9)

In [8]:
# get the data type of each series (column)
nba.dtypes

Name         object
Team         object
Number      float64
Position     object
Age         float64
Height       object
Weight      float64
College      object
Salary      float64
dtype: object

In [9]:
# get the name of each series (the column labels)
nba.columns

Index(['Name', 'Team', 'Number', 'Position', 'Age', 'Height', 'Weight',
       'College', 'Salary'],
      dtype='object')

In [10]:
# both axes as a list: [row index, column index]
nba.axes

[RangeIndex(start=0, stop=458, step=1),
 Index(['Name', 'Team', 'Number', 'Position', 'Age', 'Height', 'Weight',
        'College', 'Salary'],
       dtype='object')]

In [11]:
# summary of the dataframe: index range, column names,
# non-null counts, dtypes and memory usage
nba.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458 entries, 0 to 457
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      457 non-null    object 
 1   Team      457 non-null    object 
 2   Number    457 non-null    float64
 3   Position  457 non-null    object 
 4   Age       457 non-null    float64
 5   Height    457 non-null    object 
 6   Weight    457 non-null    float64
 7   College   373 non-null    object 
 8   Salary    446 non-null    float64
dtypes: float64(4), object(5)
memory usage: 32.3+ KB


In [58]:
# load data/revenue.csv and preview the first rows
rev = pd.read_csv('data/revenue.csv')
rev.head()

Unnamed: 0,Date,New York,Los Angeles,Miami
0,1/1/16,985,122,499
1,1/2/16,738,788,534
2,1/3/16,14,20,933
3,1/4/16,730,904,885
4,1/5/16,114,71,253


In [23]:
# sum down each column; the string Date column is concatenated
# rather than added numerically
rev.sum()

Date           1/1/161/2/161/3/161/4/161/5/161/6/161/7/161/8/...
New York                                                    5475
Los Angeles                                                 5134
Miami                                                       5641
dtype: object

In [21]:
# sum across each row (axis=1 and axis="columns" are equivalent spellings)
# numeric_only=True explicitly leaves out the string Date column; without
# it, pandas >= 2.0 raises a TypeError when a row mixes str and int values
rev.sum(axis="columns", numeric_only=True)

0    1606
1    2060
2     967
3    2519
4     438
5    1935
6    1234
7    2313
8    2623
9     555
dtype: int64

In [25]:
# select one column with attribute access:
# just use the name of the series (only works if the name has no spaces)
nba.Age.head()

0    25.0
1    25.0
2    27.0
3    22.0
4    29.0
Name: Age, dtype: float64

In [26]:
# bracket syntax: also works for names containing spaces
nba["Age"].head(3)


0    25.0
1    25.0
2    27.0
Name: Age, dtype: float64

In [32]:
# multiple series: pass a list of names; the result is a dataframe
# with the columns in the order listed
nba[["Team", "Name"]].head()

Unnamed: 0,Team,Name
0,Boston Celtics,Avery Bradley
1,Boston Celtics,Jae Crowder
2,Boston Celtics,John Holland
3,Boston Celtics,R.J. Hunter
4,Boston Celtics,Jonas Jerebko


## adding a new column

In [33]:
# set each element of the new series to "Basketball";
# the new column is appended as the last column
nba["Sport"] = "Basketball"
nba.head()


Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary,Sport
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0,Basketball
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0,Basketball
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,,Basketball
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0,Basketball
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0,Basketball


In [42]:
# drop column/s
# NOTE: this call also passes index=1, so the row labelled 1 is removed
# together with the "Sport" column (info() reports 457 rows afterwards);
# inplace=True modifies nba directly instead of returning a new dataframe
nba.drop(columns= ["Sport"], index=1, inplace=True)
nba.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 457 entries, 0 to 457
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      456 non-null    object 
 1   Team      456 non-null    object 
 2   Number    456 non-null    float64
 3   Position  456 non-null    object 
 4   Age       456 non-null    float64
 5   Height    456 non-null    object 
 6   Weight    456 non-null    float64
 7   College   372 non-null    object 
 8   Salary    445 non-null    float64
dtypes: float64(4), object(5)
memory usage: 35.7+ KB


In [43]:
# with insert method, you can specify the location:
# position 0 makes "Sport" the first column, filled with "Basketball"
nba.insert(0, "Sport", "Basketball")
nba.head()

Unnamed: 0,Sport,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Basketball,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
2,Basketball,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,Basketball,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Basketball,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
5,Basketball,Amir Johnson,Boston Celtics,90.0,PF,29.0,6-9,240.0,,12000000.0


In [44]:
# broadcasting: the scalar factor is applied to every value of the Weight series
# (0.453592 is the pounds-to-kilograms factor, so Weight is presumably in lb)
nba["Weight kg"] = nba["Weight"] * 0.453592
nba.head()

Unnamed: 0,Sport,Name,Team,Number,Position,Age,Height,Weight,College,Salary,Weight kg
0,Basketball,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0,81.64656
2,Basketball,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,,92.98636
3,Basketball,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0,83.91452
4,Basketball,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0,104.779752
5,Basketball,Amir Johnson,Boston Celtics,90.0,PF,29.0,6-9,240.0,,12000000.0,108.86208


In [45]:
# express Salary in millions; missing salaries stay missing
nba["Salary (millions)"] = nba["Salary"] / 1_000_000
nba.head()

Unnamed: 0,Sport,Name,Team,Number,Position,Age,Height,Weight,College,Salary,Weight kg,Salary (millions)
0,Basketball,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0,81.64656,7.730337
2,Basketball,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,,92.98636,
3,Basketball,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0,83.91452,1.14864
4,Basketball,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0,104.779752,5.0
5,Basketball,Amir Johnson,Boston Celtics,90.0,PF,29.0,6-9,240.0,,12000000.0,108.86208,12.0


In [46]:
# count by Team: number of rows per team, largest count first
nba["Team"].value_counts()

New Orleans Pelicans      19
Memphis Grizzlies         18
New York Knicks           16
Milwaukee Bucks           16
Brooklyn Nets             15
Oklahoma City Thunder     15
Atlanta Hawks             15
Golden State Warriors     15
Dallas Mavericks          15
Phoenix Suns              15
Charlotte Hornets         15
Houston Rockets           15
Cleveland Cavaliers       15
Detroit Pistons           15
San Antonio Spurs         15
Chicago Bulls             15
Sacramento Kings          15
Portland Trail Blazers    15
Los Angeles Lakers        15
Denver Nuggets            15
Toronto Raptors           15
Washington Wizards        15
Philadelphia 76ers        15
Utah Jazz                 15
Indiana Pacers            15
Los Angeles Clippers      15
Miami Heat                15
Boston Celtics            14
Orlando Magic             14
Minnesota Timberwolves    14
Name: Team, dtype: int64

In [47]:
# last rows; the final row (457) is empty apart from its Sport value
nba.tail()

Unnamed: 0,Sport,Name,Team,Number,Position,Age,Height,Weight,College,Salary,Weight kg,Salary (millions)
453,Basketball,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0,92.079176,2.433333
454,Basketball,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0,81.192968,0.9
455,Basketball,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0,116.119552,2.9
456,Basketball,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0,104.779752,0.947276
457,Basketball,,,,,,,,,,,


In [61]:
# drop rows where every value is na (how="all");
# row 457 is kept here because its Sport column is filled in
nba.dropna(how="all").tail()

Unnamed: 0,Sport,Name,Team,Number,Position,Age,Height,Weight,College,Salary,Weight kg,Salary (millions)
453,Basketball,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0,92.079176,2.433333
454,Basketball,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0,81.192968,0.9
455,Basketball,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0,116.119552,2.9
456,Basketball,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0,104.779752,0.947276
457,Basketball,,,,,,,,,,,


In [63]:
# drop rows that have at least one na value (how="any")
nba.dropna(how="any").head()

Unnamed: 0,Sport,Name,Team,Number,Position,Age,Height,Weight,College,Salary,Weight kg,Salary (millions)
0,Basketball,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0,81.64656,7.730337
3,Basketball,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0,83.91452,1.14864
6,Basketball,Jordan Mickey,Boston Celtics,55.0,PF,21.0,6-8,235.0,LSU,1170960.0,106.59412,1.17096
7,Basketball,Kelly Olynyk,Boston Celtics,41.0,C,25.0,7-0,238.0,Gonzaga,2165160.0,107.954896,2.16516
8,Basketball,Terry Rozier,Boston Celtics,12.0,PG,22.0,6-2,190.0,Louisville,1824360.0,86.18248,1.82436


In [76]:
nba = pd.read_csv('data/nba.csv')

# drop any columns that have na
# the all-na last row is removed first; otherwise every column would
# contain an na and every column would be dropped
# dropna returns a new dataframe: use another variable or use inplace
x = nba.dropna(how="all")
x.dropna(axis=1, how="any").head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0


In [77]:
# remove a row only if it has na in the specified column/s
nba.dropna(subset = ["Salary"]).head()
# John Holland (row 2) is removed because his Salary is missing

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
5,Amir Johnson,Boston Celtics,90.0,PF,29.0,6-9,240.0,,12000000.0


## fillna method

In [2]:
nba = pd.read_csv('data/nba.csv')

# will replace all missing values with 0, not ideal for str values
# (e.g. a missing College becomes 0); returns a new dataframe, nba is unchanged
nba.fillna(0).head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,0.0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,0,5000000.0


In [4]:
# use fillna on a single series instead, so only that column is filled
# assign the result back to the column: calling inplace=True on the selected
# series is chained in-place mutation, which is deprecated in pandas 2.x and
# does not update the dataframe once copy-on-write is enabled
nba['Salary'] = nba['Salary'].fillna(0)
nba.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,0.0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0


In [5]:
nba = pd.read_csv('data/nba.csv')
# replace missing colleges with an empty string; assign the result back
# instead of calling inplace=True on the selected series, which does not
# update the dataframe once pandas copy-on-write is enabled
nba['College'] = nba['College'].fillna("")
nba.head()



Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0


## astype

In [14]:
nba = pd.read_csv('data/nba.csv')
# remove the all-na row so that no column contains only missing values
nba.dropna(how="all", inplace=True)
# fill the remaining gaps per column and assign the results back; inplace=True
# on a selected series is chained in-place mutation and does not update the
# dataframe once pandas copy-on-write is enabled
nba['Salary'] = nba['Salary'].fillna(0)
nba['College'] = nba['College'].fillna("")
nba.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 457 entries, 0 to 456
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      457 non-null    object 
 1   Team      457 non-null    object 
 2   Number    457 non-null    float64
 3   Position  457 non-null    object 
 4   Age       457 non-null    float64
 5   Height    457 non-null    object 
 6   Weight    457 non-null    float64
 7   College   457 non-null    object 
 8   Salary    457 non-null    float64
dtypes: float64(4), object(5)
memory usage: 35.7+ KB


In [16]:
# cast Salary from float to integer; astype returns a new series,
# so assign it back to the column
nba["Salary"] = nba["Salary"].astype("int")
nba.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 457 entries, 0 to 456
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      457 non-null    object 
 1   Team      457 non-null    object 
 2   Number    457 non-null    float64
 3   Position  457 non-null    object 
 4   Age       457 non-null    float64
 5   Height    457 non-null    object 
 6   Weight    457 non-null    float64
 7   College   457 non-null    object 
 8   Salary    457 non-null    int64  
dtypes: float64(3), int64(1), object(5)
memory usage: 35.7+ KB


In [18]:
# cast Number and Age to integers in one call by mapping column name -> dtype
nba = nba.astype({"Number": "int", "Age": "int"})
nba.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 457 entries, 0 to 456
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      457 non-null    object 
 1   Team      457 non-null    object 
 2   Number    457 non-null    int64  
 3   Position  457 non-null    object 
 4   Age       457 non-null    int64  
 5   Height    457 non-null    object 
 6   Weight    457 non-null    float64
 7   College   457 non-null    object 
 8   Salary    457 non-null    int64  
dtypes: float64(1), int64(3), object(5)
memory usage: 35.7+ KB


In [19]:
# number of distinct values in Position (five here)
nba["Position"].nunique()


5

In [20]:
# Position has only a handful of distinct values, so store it as a category
nba["Position"] = nba["Position"].astype("category")
nba.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 457 entries, 0 to 456
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   Name      457 non-null    object  
 1   Team      457 non-null    object  
 2   Number    457 non-null    int64   
 3   Position  457 non-null    category
 4   Age       457 non-null    int64   
 5   Height    457 non-null    object  
 6   Weight    457 non-null    float64 
 7   College   457 non-null    object  
 8   Salary    457 non-null    int64   
dtypes: category(1), float64(1), int64(3), object(4)
memory usage: 52.8+ KB


In [21]:
# convert Team to a category as well
nba["Team"] = nba["Team"].astype("category")
nba.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 457 entries, 0 to 456
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   Name      457 non-null    object  
 1   Team      457 non-null    category
 2   Number    457 non-null    int64   
 3   Position  457 non-null    category
 4   Age       457 non-null    int64   
 5   Height    457 non-null    object  
 6   Weight    457 non-null    float64 
 7   College   457 non-null    object  
 8   Salary    457 non-null    int64   
dtypes: category(2), float64(1), int64(3), object(3)
memory usage: 51.1+ KB


## sorting


In [3]:
nba = pd.read_csv('data/nba.csv')
# by default na_position is last, but can change to first
# NOTE: sort_values returns a new sorted dataframe and the result is not
# stored here, so nba itself is unchanged and head() below still shows the
# original row order; assign the result or pass inplace=True to keep the sort
nba.sort_values(by="Name", ascending=True, na_position="last")
nba.head()


Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0


In [6]:
# sort by multiple series: Team ascending first,
# then Name descending within each team
nba.sort_values(by=["Team", "Name"], ascending=[True, False]).head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
322,Walter Tavares,Atlanta Hawks,22.0,C,24.0,7-3,260.0,,1000000.0
310,Tim Hardaway Jr.,Atlanta Hawks,10.0,SG,24.0,6-6,205.0,Michigan,1304520.0
321,Tiago Splitter,Atlanta Hawks,11.0,C,31.0,6-11,245.0,,9756250.0
320,Thabo Sefolosha,Atlanta Hawks,25.0,SF,32.0,6-7,220.0,,4000000.0
315,Paul Millsap,Atlanta Hawks,4.0,PF,31.0,6-8,246.0,Louisiana Tech,18671659.0


## sort_index

In [7]:
# change sort of dataframe
# inplace=True reorders nba itself instead of returning a new dataframe;
# the original index labels travel with their rows
nba = pd.read_csv('data/nba.csv')
nba.sort_values(by="Name", inplace=True)
nba.head()


Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
152,Aaron Brooks,Chicago Bulls,0.0,PG,31.0,6-0,161.0,Oregon,2250000.0
356,Aaron Gordon,Orlando Magic,0.0,PF,20.0,6-9,220.0,Arizona,4171680.0
328,Aaron Harrison,Charlotte Hornets,9.0,SG,21.0,6-6,210.0,Kentucky,525093.0
404,Adreian Payne,Minnesota Timberwolves,33.0,PF,25.0,6-10,237.0,Michigan State,1938840.0
312,Al Horford,Atlanta Hawks,15.0,C,30.0,6-10,245.0,Florida,12000000.0


In [8]:
# get back the original order by sorting on the index labels
nba.sort_index(inplace=True)
nba.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0


In [12]:
# reload, drop the all-na row, and make Salary an integer column
# (missing salaries are filled with 0 first)
nba = pd.read_csv('data/nba.csv').dropna(how="all")
nba['Salary'] = nba['Salary'].fillna(0).astype("int")
# rank salaries from highest (rank 1) to lowest
# NOTE(review): rank() is called with its default tie handling, so tied
# salaries share one rank (John Holland's filled-in 0 salary shows 452) and
# astype("int") truncates any fractional rank - confirm this is intended
nba['SalaryRank'] = nba['Salary'].rank(ascending=False).astype("int")
nba.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary,SalaryRank
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337,97
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117,110
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,0,452
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640,322
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000,147


In [14]:
# highest salaries first; SalaryRank 1 lines up with the largest Salary
nba.sort_values("Salary", ascending=False).head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary,SalaryRank
109,Kobe Bryant,Los Angeles Lakers,24.0,SF,37.0,6-6,212.0,,25000000,1
169,LeBron James,Cleveland Cavaliers,23.0,SF,31.0,6-8,250.0,,22970500,2
33,Carmelo Anthony,New York Knicks,7.0,SF,32.0,6-8,240.0,Syracuse,22875000,3
251,Dwight Howard,Houston Rockets,12.0,C,30.0,6-11,265.0,,22359364,4
339,Chris Bosh,Miami Heat,1.0,PF,32.0,6-11,235.0,Georgia Tech,22192730,5
