# Data Wrangling with Pandas - prepare the data

In [1]:
import numpy as np
import pandas as pd

In [2]:
# read the cleaned wildfire data from a pickle file and preview the first rows
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source.
fires = pd.read_pickle('fires_cleaned.pkl')
fires.head()

Unnamed: 0,fire_name,fire_year,state,discovery_date,contain_date,acres_burned
16,POWER,2004,CA,2004-10-06,2004-10-21,16823.0
17,FREDS,2004,CA,2004-10-13,2004-10-17,7700.0
25,BACHELOR,2004,NM,2004-07-20,2004-07-20,10.0
37,HOWARD GAP,2005,NC,2005-01-27,2005-01-28,50.3
39,AUSTIN CREEK,2005,NC,2005-02-12,2005-02-13,125.0


In [3]:
fires.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 274123 entries, 16 to 1880441
Data columns (total 6 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   fire_name       128667 non-null  object        
 1   fire_year       274123 non-null  int64         
 2   state           274123 non-null  category      
 3   discovery_date  274123 non-null  datetime64[ns]
 4   contain_date    137376 non-null  datetime64[ns]
 5   acres_burned    274123 non-null  float64       
dtypes: category(1), datetime64[ns](2), float64(1), int64(1), object(1)
memory usage: 12.8+ MB


### Working with string, numeric, and datetime columns

In [4]:
# keep a plain object-dtype copy of the categorical state column
fires['state_obj'] = fires.state.astype('object')

In [5]:
fires.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 274123 entries, 16 to 1880441
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   fire_name       128667 non-null  object        
 1   fire_year       274123 non-null  int64         
 2   state           274123 non-null  category      
 3   discovery_date  274123 non-null  datetime64[ns]
 4   contain_date    137376 non-null  datetime64[ns]
 5   acres_burned    274123 non-null  float64       
 6   state_obj       274123 non-null  object        
dtypes: category(1), datetime64[ns](2), float64(1), int64(1), object(2)
memory usage: 14.9+ MB


In [6]:
# create a new column that combines the fire_name and state_obj columns,
# joined with an underscore (a missing fire_name gives a missing result)
fires['fire_name_state_obj'] = fires['fire_name'].str.cat(fires['state_obj'], sep="_")

In [7]:
fires.head()

Unnamed: 0,fire_name,fire_year,state,discovery_date,contain_date,acres_burned,state_obj,fire_name_state_obj
16,POWER,2004,CA,2004-10-06,2004-10-21,16823.0,CA,POWER_CA
17,FREDS,2004,CA,2004-10-13,2004-10-17,7700.0,CA,FREDS_CA
25,BACHELOR,2004,NM,2004-07-20,2004-07-20,10.0,NM,BACHELOR_NM
37,HOWARD GAP,2005,NC,2005-01-27,2005-01-28,50.3,NC,HOWARD GAP_NC
39,AUSTIN CREEK,2005,NC,2005-02-12,2005-02-13,125.0,NC,AUSTIN CREEK_NC


In [8]:
# create a new column that combines the fire_name and state columns
# NOTE: this cell is expected to fail. The recorded output below is a
# TypeError, because `state` has the category dtype and that dtype does not
# support "+".
fires['fire_name_state'] = fires['fire_name'] +"_"+ fires['state']

TypeError: Object with dtype category cannot perform the numpy op add

In [9]:
# convert the categorical state column to strings first, then combine
# Only `state` needs converting: "+" is not defined for the category dtype.
# `fire_name` is deliberately left as-is, because astype(str) would turn its
# missing values into the literal text 'nan' and produce labels such as
# 'nan_CA'. This way a missing name stays missing, matching
# fire_name_state_obj.
fires['fire_name_state'] = fires['fire_name'] + "_" + fires['state'].astype(str)

In [10]:
fires.head()

Unnamed: 0,fire_name,fire_year,state,discovery_date,contain_date,acres_burned,state_obj,fire_name_state_obj,fire_name_state
16,POWER,2004,CA,2004-10-06,2004-10-21,16823.0,CA,POWER_CA,POWER_CA
17,FREDS,2004,CA,2004-10-13,2004-10-17,7700.0,CA,FREDS_CA,FREDS_CA
25,BACHELOR,2004,NM,2004-07-20,2004-07-20,10.0,NM,BACHELOR_NM,BACHELOR_NM
37,HOWARD GAP,2005,NC,2005-01-27,2005-01-28,50.3,NC,HOWARD GAP_NC,HOWARD GAP_NC
39,AUSTIN CREEK,2005,NC,2005-02-12,2005-02-13,125.0,NC,AUSTIN CREEK_NC,AUSTIN CREEK_NC


In [11]:
fires[['fire_year','state','fire_name']].head()

Unnamed: 0,fire_year,state,fire_name
16,2004,CA,POWER
17,2004,CA,FREDS
25,2004,NM,BACHELOR
37,2005,NC,HOWARD GAP
39,2005,NC,AUSTIN CREEK


### Pandas dt accessor

* **year**
* **month**
* **day**
* **quarter**
* **days**
* **seconds**

In [12]:
# numeric month (1-12) in which each fire was discovered
fires['fire_month'] = fires['discovery_date'].dt.month

In [13]:
# whole days between discovery and containment, taken from the timedelta
# (missing wherever contain_date is missing)
fires['days_burning'] = fires['contain_date'].sub(fires['discovery_date']).dt.days

In [14]:
# burn duration expressed in months, using a fixed 30-day month
fires['month_burning'] = fires['days_burning'].div(30)
fires.head()

Unnamed: 0,fire_name,fire_year,state,discovery_date,contain_date,acres_burned,state_obj,fire_name_state_obj,fire_name_state,fire_month,days_burning,month_burning
16,POWER,2004,CA,2004-10-06,2004-10-21,16823.0,CA,POWER_CA,POWER_CA,10,15.0,0.5
17,FREDS,2004,CA,2004-10-13,2004-10-17,7700.0,CA,FREDS_CA,FREDS_CA,10,4.0,0.133333
25,BACHELOR,2004,NM,2004-07-20,2004-07-20,10.0,NM,BACHELOR_NM,BACHELOR_NM,7,0.0,0.0
37,HOWARD GAP,2005,NC,2005-01-27,2005-01-28,50.3,NC,HOWARD GAP_NC,HOWARD GAP_NC,1,1.0,0.033333
39,AUSTIN CREEK,2005,NC,2005-02-12,2005-02-13,125.0,NC,AUSTIN CREEK_NC,AUSTIN CREEK_NC,2,1.0,0.033333


### Pandas str accessor

* **count(str)**
* **lower()**
* **upper()**
* **title()**
* **lstrip(str)**
* **rstrip(str)**
* **strip(str)**
* **startswith(str)**
* **endswith(str)**
* **find(str)**
* **replace(old,new)**
* **join(sequence)**

In [15]:
# overwrite fire_name with its title-cased form (e.g. 'HOWARD GAP' -> 'Howard Gap')
fires['fire_name'] = fires['fire_name'].str.title()

In [16]:
# build a display name such as 'The Power Fire (2004)' from the fire name
# and the year (the integer year must be converted to str before concatenating)
fires['full_name'] = (
    'The ' + fires['fire_name'] + ' Fire '
    + '(' + fires['fire_year'].astype(str) + ')'
)

In [17]:
fires.head()

Unnamed: 0,fire_name,fire_year,state,discovery_date,contain_date,acres_burned,state_obj,fire_name_state_obj,fire_name_state,fire_month,days_burning,month_burning,full_name
16,Power,2004,CA,2004-10-06,2004-10-21,16823.0,CA,POWER_CA,POWER_CA,10,15.0,0.5,The Power Fire (2004)
17,Freds,2004,CA,2004-10-13,2004-10-17,7700.0,CA,FREDS_CA,FREDS_CA,10,4.0,0.133333,The Freds Fire (2004)
25,Bachelor,2004,NM,2004-07-20,2004-07-20,10.0,NM,BACHELOR_NM,BACHELOR_NM,7,0.0,0.0,The Bachelor Fire (2004)
37,Howard Gap,2005,NC,2005-01-27,2005-01-28,50.3,NC,HOWARD GAP_NC,HOWARD GAP_NC,1,1.0,0.033333,The Howard Gap Fire (2005)
39,Austin Creek,2005,NC,2005-02-12,2005-02-13,125.0,NC,AUSTIN CREEK_NC,AUSTIN CREEK_NC,2,1.0,0.033333,The Austin Creek Fire (2005)


In [18]:
# acres burned per day
# Divide the two columns directly: a row whose days_burning is missing already
# yields a missing result. Calling fires.dropna() on both sides would also
# blank out every row that is missing an unrelated column (e.g. fire_name),
# and it builds two filtered copies of the whole frame.
# NOTE: a fire contained on its discovery day has days_burning == 0, so its
# rate comes out as inf.
fires['acres_per_day'] = fires['acres_burned'] / fires['days_burning']

In [19]:
fires[['fire_name','full_name','acres_burned','days_burning','acres_per_day']].head()

Unnamed: 0,fire_name,full_name,acres_burned,days_burning,acres_per_day
16,Power,The Power Fire (2004),16823.0,15.0,1121.533333
17,Freds,The Freds Fire (2004),7700.0,4.0,1925.0
25,Bachelor,The Bachelor Fire (2004),10.0,0.0,inf
37,Howard Gap,The Howard Gap Fire (2005),50.3,1.0,50.3
39,Austin Creek,The Austin Creek Fire (2005),125.0,1.0,125.0


In [20]:
fires[['state','days_burning']].head()

Unnamed: 0,state,days_burning
16,CA,15.0
17,CA,4.0
25,NM,0.0
37,NC,1.0
39,NC,1.0


In [21]:
fires.shape

(274123, 14)

### Pandas transform() method

* **transform()** -> Adds summary values to each row.

Parameters:

* **func** -> The function to apply.
* **axis** -> The axis that the function is applied along: 0 (the default) applies it to each column; 1 applies it to each row.

In [22]:
# add mean days_burning by state to each row
fires.groupby('state')['days_burning'].transform(func='mean')

16         5.387197
17         5.387197
25         6.085806
37         1.015474
39         1.015474
             ...   
1880387    5.387197
1880399    5.387197
1880411    5.387197
1880419    5.387197
1880441    5.387197
Name: days_burning, Length: 274123, dtype: float64

In [23]:
# summary column: every row receives the mean days_burning of its own state
fires['mean_days'] = fires.groupby('state').days_burning.transform('mean')
fires.loc[:, ['state','days_burning','mean_days']].head()

Unnamed: 0,state,days_burning,mean_days
16,CA,15.0,5.387197
17,CA,4.0,5.387197
25,NM,0.0,6.085806
37,NC,1.0,1.015474
39,NC,1.0,1.015474


In [24]:
# summary column: every row receives the total days_burning of its own state
fires['state_total_days_burning'] = fires.groupby('state').days_burning.transform(func='sum')
fires.loc[:, ['state','days_burning','mean_days','state_total_days_burning']].head()

Unnamed: 0,state,days_burning,mean_days,state_total_days_burning
16,CA,15.0,5.387197,36272.0
17,CA,4.0,5.387197,36272.0
25,NM,0.0,6.085806,20923.0
37,NC,1.0,1.015474,2100.0
39,NC,1.0,1.015474,2100.0


In [25]:
# each fire's share of its state's total days_burning (a fraction, not yet formatted)
fires['days_burning_pct'] = fires['days_burning'].div(fires['state_total_days_burning'])

In [26]:
fires.head()

Unnamed: 0,fire_name,fire_year,state,discovery_date,contain_date,acres_burned,state_obj,fire_name_state_obj,fire_name_state,fire_month,days_burning,month_burning,full_name,acres_per_day,mean_days,state_total_days_burning,days_burning_pct
16,Power,2004,CA,2004-10-06,2004-10-21,16823.0,CA,POWER_CA,POWER_CA,10,15.0,0.5,The Power Fire (2004),1121.533333,5.387197,36272.0,0.000414
17,Freds,2004,CA,2004-10-13,2004-10-17,7700.0,CA,FREDS_CA,FREDS_CA,10,4.0,0.133333,The Freds Fire (2004),1925.0,5.387197,36272.0,0.00011
25,Bachelor,2004,NM,2004-07-20,2004-07-20,10.0,NM,BACHELOR_NM,BACHELOR_NM,7,0.0,0.0,The Bachelor Fire (2004),inf,6.085806,20923.0,0.0
37,Howard Gap,2005,NC,2005-01-27,2005-01-28,50.3,NC,HOWARD GAP_NC,HOWARD GAP_NC,1,1.0,0.033333,The Howard Gap Fire (2005),50.3,1.015474,2100.0,0.000476
39,Austin Creek,2005,NC,2005-02-12,2005-02-13,125.0,NC,AUSTIN CREEK_NC,AUSTIN CREEK_NC,2,1.0,0.033333,The Austin Creek Fire (2005),125.0,1.015474,2100.0,0.000476


### Pandas apply() method: applies functions to rows or columns

* **apply(params)** -> Applies a function to the data in a row or a column and returns a Series.

Parameters:

* **function** -> The function that’s applied to each row or column. It can be a built-in function, a NumPy function, a user-defined function, or a lambda expression.
* **axis** -> The axis that the function is applied to: axis=0 (the default) for columns and axis=1 for rows.

In [27]:
# change the format of the percentage column
# Only format while the column is still numeric: on a second run the values
# are already strings, and format(str, '.2%') raises ValueError.
# Missing values are passed through unchanged instead of becoming 'nan%'.
if pd.api.types.is_numeric_dtype(fires['days_burning_pct']):
    fires['days_burning_pct'] = fires['days_burning_pct'].apply(
        lambda x: format(x, '.2%') if pd.notna(x) else x)
fires.head()

Unnamed: 0,fire_name,fire_year,state,discovery_date,contain_date,acres_burned,state_obj,fire_name_state_obj,fire_name_state,fire_month,days_burning,month_burning,full_name,acres_per_day,mean_days,state_total_days_burning,days_burning_pct
16,Power,2004,CA,2004-10-06,2004-10-21,16823.0,CA,POWER_CA,POWER_CA,10,15.0,0.5,The Power Fire (2004),1121.533333,5.387197,36272.0,0.04%
17,Freds,2004,CA,2004-10-13,2004-10-17,7700.0,CA,FREDS_CA,FREDS_CA,10,4.0,0.133333,The Freds Fire (2004),1925.0,5.387197,36272.0,0.01%
25,Bachelor,2004,NM,2004-07-20,2004-07-20,10.0,NM,BACHELOR_NM,BACHELOR_NM,7,0.0,0.0,The Bachelor Fire (2004),inf,6.085806,20923.0,0.00%
37,Howard Gap,2005,NC,2005-01-27,2005-01-28,50.3,NC,HOWARD GAP_NC,HOWARD GAP_NC,1,1.0,0.033333,The Howard Gap Fire (2005),50.3,1.015474,2100.0,0.05%
39,Austin Creek,2005,NC,2005-02-12,2005-02-13,125.0,NC,AUSTIN CREEK_NC,AUSTIN CREEK_NC,2,1.0,0.033333,The Austin Creek Fire (2005),125.0,1.015474,2100.0,0.05%


### More on apply method

In [28]:
# read the work survey data from a pickle file and preview the first 3 rows
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source.
workData = pd.read_pickle('workData.pkl')
workData.head(3)

Unnamed: 0_level_0,sex,region,wrkstat,hrs1,wkcontct,talkspvs,effctsup
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2,2,1,1.0,40.0,3.0,4.0,4.0
4,2,1,2.0,20.0,1.0,4.0,4.0
14,2,2,1.0,37.0,1.0,4.0,3.0


In [29]:
# apply() method applies a function to the data in a row or a column and returns a Series.
# axis=0 (the default) for columns and axis=1 for rows.
# Any existing 'Total' column is excluded first, so re-running this cell does
# not add the previous total into the new one. errors='ignore' makes the drop
# a no-op on the first run, when 'Total' does not exist yet.
workData['Total'] = workData.drop(columns='Total', errors='ignore').apply('sum', axis=1)

In [30]:
workData.head()

Unnamed: 0_level_0,sex,region,wrkstat,hrs1,wkcontct,talkspvs,effctsup,Total
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2,2,1,1.0,40.0,3.0,4.0,4.0,55.0
4,2,1,2.0,20.0,1.0,4.0,4.0,34.0
14,2,2,1.0,37.0,1.0,4.0,3.0,50.0
19,1,1,1.0,50.0,1.0,3.0,4.0,61.0
21,2,1,1.0,38.0,1.0,4.0,4.0,51.0


In [31]:
workData.shape

(970, 8)

In [32]:
# get the mean of each column using a built-in Pandas function
# (no axis is given, so the default axis=0 applies the function column by column)
workData.apply('mean')

sex          1.529897
region       5.184536
wrkstat      1.163918
hrs1        42.083505
wkcontct     2.796907
talkspvs     3.291753
effctsup     3.253608
Total       59.304124
dtype: float64

In [33]:
# apply a NumPy function to two columns to get their column-wise means
# (numpy is imported as np in the imports cell at the top of the notebook,
# so the dependency is visible up front rather than buried mid-notebook)
workData[['sex','hrs1']].apply(np.mean)

sex      1.529897
hrs1    42.083505
dtype: float64

In [34]:
# average the three rating columns per respondent (axis=1 -> one value per row)
workData['avg_rating'] = (
    workData.loc[:, ['wkcontct','talkspvs','effctsup']]
    .apply(np.mean, axis=1)
)
workData.head(3)

Unnamed: 0_level_0,sex,region,wrkstat,hrs1,wkcontct,talkspvs,effctsup,Total,avg_rating
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2,2,1,1.0,40.0,3.0,4.0,4.0,55.0,3.666667
4,2,1,2.0,20.0,1.0,4.0,4.0,34.0,3.0
14,2,2,1.0,37.0,1.0,4.0,3.0,50.0,2.666667


### apply user-defined functions

In [35]:
def convert_sex(row):
    """Translate the numeric code in ``row.sex`` into a text label.

    Returns 'male' for code 1, 'female' for code 2, and 'non-binary' for
    every other value.
    """
    code = row.sex
    if code == 1:
        return 'male'
    if code == 2:
        return 'female'
    return 'non-binary'
    
# Replace the numeric sex codes with labels, one row at a time.
# Guard on the dtype so the conversion runs only while the column still holds
# numeric codes: convert_sex maps anything other than 1 or 2 to 'non-binary',
# so re-running this cell on the already-converted labels would relabel
# every row.
if pd.api.types.is_numeric_dtype(workData['sex']):
    workData['sex'] = workData.apply(convert_sex, axis=1)
workData.head()

Unnamed: 0_level_0,sex,region,wrkstat,hrs1,wkcontct,talkspvs,effctsup,Total,avg_rating
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2,female,1,1.0,40.0,3.0,4.0,4.0,55.0,3.666667
4,female,1,2.0,20.0,1.0,4.0,4.0,34.0,3.0
14,female,2,1.0,37.0,1.0,4.0,3.0,50.0,2.666667
19,male,1,1.0,50.0,1.0,3.0,4.0,61.0,2.666667
21,female,1,1.0,38.0,1.0,4.0,4.0,51.0,3.0


### lambda expressions work with DataFrames

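* **The if syntax** (a Python conditional expression always needs an `else`, so a bare `if` is not valid in a lambda; return `None` explicitly when there is no alternative value)

    lambda arguments: return_value_if_true if condition else None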

* **The if-else syntax**

    lambda arguments: return_value_if_true if condition else return_value_if_false

* **The if-elif-else syntax**

    lambda arguments: return_value_if_condition_1_true if condition_1 else (return_value_if_condition_2_true if condition_2 else return_value_if_condition_2_false)


In [36]:
# small 2x3 example frame, built column by column
df = pd.DataFrame({'col1': [0, 3], 'col2': [1, 4], 'col3': [2, 5]})
df

Unnamed: 0,col1,col2,col3
0,0,1,2
1,3,4,5


In [37]:
df.apply(lambda x: x.sum() * 2, axis=0) # column-wise

col1     6
col2    10
col3    14
dtype: int64

In [38]:
df.apply(lambda x: x.sum() * 2, axis=1) # row-wise

0     6
1    24
dtype: int64

#### How to apply lambda expressions

In [39]:
# if wrkstat=1.0 replace it with 'full-time' else 'part-time'
# Guard on the dtype so the conversion runs only while the column still holds
# numeric codes: on a second run every value is already a string, none equals
# 1.0, and every row would be relabelled 'part-time'.
if pd.api.types.is_numeric_dtype(workData['wrkstat']):
    workData['wrkstat'] = workData.apply(
        lambda row: 'full-time' if row.wrkstat == 1.0 else 'part-time', axis=1)
workData.head()

Unnamed: 0_level_0,sex,region,wrkstat,hrs1,wkcontct,talkspvs,effctsup,Total,avg_rating
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2,female,1,full-time,40.0,3.0,4.0,4.0,55.0,3.666667
4,female,1,part-time,20.0,1.0,4.0,4.0,34.0,3.0
14,female,2,full-time,37.0,1.0,4.0,3.0,50.0,2.666667
19,male,1,full-time,50.0,1.0,3.0,4.0,61.0,2.666667
21,female,1,full-time,38.0,1.0,4.0,4.0,51.0,3.0


### What the SettingWithCopyWarning is warning you about

In [40]:
# load the cleaned shot data and use game_id as the row index
shots = pd.read_pickle('shot_cleaned.pkl').set_index('game_id')

In [41]:
shots.tail()

Unnamed: 0_level_0,player_name,period,minutes_remaining,seconds_remaining,event_type,action_type,shot_type,shot_distance,loc_x,loc_y,shot_attempted_flag,shot_made_flag,game_date,home_team,visiting_team
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
21801205,Stephen Curry,3,0,3,Made Shot,Step Back Jump shot,3PT Field Goal,30,-201,224,1,1,2019-04-07,GSW,LAC
21801215,Stephen Curry,1,9,49,Missed Shot,Step Back Jump shot,2PT Field Goal,18,-180,-15,1,0,2019-04-09,NOP,GSW
21801215,Stephen Curry,1,8,1,Made Shot,Jump Shot,3PT Field Goal,26,73,255,1,1,2019-04-09,NOP,GSW
21801215,Stephen Curry,1,6,36,Missed Shot,Jump Shot,3PT Field Goal,23,132,199,1,0,2019-04-09,NOP,GSW
21801215,Stephen Curry,1,2,43,Made Shot,Jump Shot,2PT Field Goal,12,-129,-15,1,1,2019-04-09,NOP,GSW


In [42]:
df = shots.copy(deep=True)

In [43]:
df.head(3)

Unnamed: 0_level_0,player_name,period,minutes_remaining,seconds_remaining,event_type,action_type,shot_type,shot_distance,loc_x,loc_y,shot_attempted_flag,shot_made_flag,game_date,home_team,visiting_team
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
20900015,Stephen Curry,1,11,25,Missed Shot,Jump Shot,3PT Field Goal,26,99,249,1,0,2009-10-28,GSW,HOU
20900015,Stephen Curry,1,9,31,Made Shot,Step Back Jump shot,2PT Field Goal,18,-122,145,1,1,2009-10-28,GSW,HOU
20900015,Stephen Curry,1,6,2,Missed Shot,Jump Shot,2PT Field Goal,14,-60,129,1,0,2009-10-28,GSW,HOU


### Generates the warning and corrupts the data

In [44]:
# Deliberately problematic example: select the rows of one game from df and
# assign through the result without taking an explicit copy first. The
# recorded output below shows pandas emitting SettingWithCopyWarning for the
# assignment on the second line.
dfSlice = df.loc['0020900015',:]        # creates the slice (view)
dfSlice.loc[:,'player_name'] = 'Curry'  # modifies the slice

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfSlice.loc[:,'player_name'] = 'Curry'  # modifies the slice


In [45]:
df.head(3)

Unnamed: 0_level_0,player_name,period,minutes_remaining,seconds_remaining,event_type,action_type,shot_type,shot_distance,loc_x,loc_y,shot_attempted_flag,shot_made_flag,game_date,home_team,visiting_team
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
20900015,Curry,1,11,25,Missed Shot,Jump Shot,3PT Field Goal,26,99,249,1,0,2009-10-28,GSW,HOU
20900015,Curry,1,9,31,Made Shot,Step Back Jump shot,2PT Field Goal,18,-122,145,1,1,2009-10-28,GSW,HOU
20900015,Curry,1,6,2,Missed Shot,Jump Shot,2PT Field Goal,14,-60,129,1,0,2009-10-28,GSW,HOU


In [46]:
dfSlice.head(3)

Unnamed: 0_level_0,player_name,period,minutes_remaining,seconds_remaining,event_type,action_type,shot_type,shot_distance,loc_x,loc_y,shot_attempted_flag,shot_made_flag,game_date,home_team,visiting_team
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
20900015,Curry,1,11,25,Missed Shot,Jump Shot,3PT Field Goal,26,99,249,1,0,2009-10-28,GSW,HOU
20900015,Curry,1,9,31,Made Shot,Step Back Jump shot,2PT Field Goal,18,-122,145,1,1,2009-10-28,GSW,HOU
20900015,Curry,1,6,2,Missed Shot,Jump Shot,2PT Field Goal,14,-60,129,1,0,2009-10-28,GSW,HOU


Both DataFrames have been modified!

### use copy() method to fix this code

In [47]:
df = shots.copy(deep=True)

In [48]:
# Take an explicit copy of the selected rows, so dfSlice is an independent
# DataFrame and the assignment below changes dfSlice only.
dfSlice = df.loc['0020900015',:].copy()
dfSlice.loc[:,'player_name'] = 'Curry'
# df still shows the original player_name values (see the output below)
df.head(2)

Unnamed: 0_level_0,player_name,period,minutes_remaining,seconds_remaining,event_type,action_type,shot_type,shot_distance,loc_x,loc_y,shot_attempted_flag,shot_made_flag,game_date,home_team,visiting_team
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
20900015,Stephen Curry,1,11,25,Missed Shot,Jump Shot,3PT Field Goal,26,99,249,1,0,2009-10-28,GSW,HOU
20900015,Stephen Curry,1,9,31,Made Shot,Step Back Jump shot,2PT Field Goal,18,-122,145,1,1,2009-10-28,GSW,HOU


In [49]:
dfSlice.head(2)

Unnamed: 0_level_0,player_name,period,minutes_remaining,seconds_remaining,event_type,action_type,shot_type,shot_distance,loc_x,loc_y,shot_attempted_flag,shot_made_flag,game_date,home_team,visiting_team
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
20900015,Curry,1,11,25,Missed Shot,Jump Shot,3PT Field Goal,26,99,249,1,0,2009-10-28,GSW,HOU
20900015,Curry,1,9,31,Made Shot,Step Back Jump shot,2PT Field Goal,18,-122,145,1,1,2009-10-28,GSW,HOU


Only the dfSlice DataFrame has been modified!