# Chapter 6  Index Alignment

# 1.Examining the Index object

In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
#1. Read in the college dataset, assign for the column index to a variable, and output it:
college = pd.read_csv('college.csv')
columns = college.columns
columns

Index(['INSTNM', 'CITY', 'STABBR', 'HBCU', 'MENONLY', 'WOMENONLY', 'RELAFFIL',
       'SATVRMID', 'SATMTMID', 'DISTANCEONLY', 'UGDS', 'UGDS_WHITE',
       'UGDS_BLACK', 'UGDS_HISP', 'UGDS_ASIAN', 'UGDS_AIAN', 'UGDS_NHPI',
       'UGDS_2MOR', 'UGDS_NRA', 'UGDS_UNKN', 'PPTUG_EF', 'CURROPER', 'PCTPELL',
       'PCTFLOAN', 'UG25ABV', 'MD_EARN_WNE_P10', 'GRAD_DEBT_MDN_SUPP'],
      dtype='object')

In [5]:
#2. Use the values attribute to access the underlying NumPy array:
columns.values

array(['INSTNM', 'CITY', 'STABBR', 'HBCU', 'MENONLY', 'WOMENONLY',
       'RELAFFIL', 'SATVRMID', 'SATMTMID', 'DISTANCEONLY', 'UGDS',
       'UGDS_WHITE', 'UGDS_BLACK', 'UGDS_HISP', 'UGDS_ASIAN', 'UGDS_AIAN',
       'UGDS_NHPI', 'UGDS_2MOR', 'UGDS_NRA', 'UGDS_UNKN', 'PPTUG_EF',
       'CURROPER', 'PCTPELL', 'PCTFLOAN', 'UG25ABV', 'MD_EARN_WNE_P10',
       'GRAD_DEBT_MDN_SUPP'], dtype=object)

In [6]:
#3. Select items from the index by integer location with scalars, lists, or slices:
columns[5]


'WOMENONLY'

In [7]:
columns[[1,8,10]]

Index(['CITY', 'SATMTMID', 'UGDS'], dtype='object')

In [8]:
columns[-7:-4]

Index(['PPTUG_EF', 'CURROPER', 'PCTPELL'], dtype='object')

In [9]:
#4. Indexes share many of the same methods as Series and DataFrames:
columns.min(), columns.max(), columns.isnull().sum()

('CITY', 'WOMENONLY', 0)

In [10]:
columns.isnull().sum() # counts no. of the null values in the columns 

0

In [11]:
#5. Use basic arithmetic and comparison operators directly on Index objects:
columns + '_A'

Index(['INSTNM_A', 'CITY_A', 'STABBR_A', 'HBCU_A', 'MENONLY_A', 'WOMENONLY_A',
       'RELAFFIL_A', 'SATVRMID_A', 'SATMTMID_A', 'DISTANCEONLY_A', 'UGDS_A',
       'UGDS_WHITE_A', 'UGDS_BLACK_A', 'UGDS_HISP_A', 'UGDS_ASIAN_A',
       'UGDS_AIAN_A', 'UGDS_NHPI_A', 'UGDS_2MOR_A', 'UGDS_NRA_A',
       'UGDS_UNKN_A', 'PPTUG_EF_A', 'CURROPER_A', 'PCTPELL_A', 'PCTFLOAN_A',
       'UG25ABV_A', 'MD_EARN_WNE_P10_A', 'GRAD_DEBT_MDN_SUPP_A'],
      dtype='object')

In [12]:
columns > 'G'

array([ True, False,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True])

In [13]:
#6. Trying to change an Index value directly after its creation fails. Indexes are immutable objects:
columns[1] = 'city'

TypeError: Index does not support mutable operations

In [14]:
#Indexes support the set operations, union, intersection, difference, and symmetric difference:
c1 = columns[:4]
c1

Index(['INSTNM', 'CITY', 'STABBR', 'HBCU'], dtype='object')

In [15]:
c2 = columns[2:6]
c2

Index(['STABBR', 'HBCU', 'MENONLY', 'WOMENONLY'], dtype='object')

In [16]:
c1.union(c2) # or `c1 | c2`


Index(['CITY', 'HBCU', 'INSTNM', 'MENONLY', 'STABBR', 'WOMENONLY'], dtype='object')

In [17]:
c1.symmetric_difference(c2) # or `c1 ^ c2`

Index(['CITY', 'INSTNM', 'MENONLY', 'WOMENONLY'], dtype='object')

In [18]:
c1

Index(['INSTNM', 'CITY', 'STABBR', 'HBCU'], dtype='object')

In [19]:
c2

Index(['STABBR', 'HBCU', 'MENONLY', 'WOMENONLY'], dtype='object')

# 2.Producing Cartesian products

In [20]:
#1. Construct two Series that have indexes that are different but contain some of the same values:
s1 = pd.Series(index=list('aaab'), data=np.arange(4))
s1

a    0
a    1
a    2
b    3
dtype: int32

In [21]:
s2 = pd.Series(index=list('cababb'), data=np.arange(6))
s2

c    0
a    1
b    2
a    3
b    4
b    5
dtype: int32

In [22]:
#2. Add the two Series together to produce a Cartesian product:
s1 + s2

a    1.0
a    3.0
a    2.0
a    4.0
a    3.0
a    5.0
b    5.0
b    7.0
b    8.0
c    NaN
dtype: float64

In [23]:
s1 = pd.Series(index=list('aaabb'), data=np.arange(5))
s2 = pd.Series(index=list('aaabb'), data=np.arange(5))
s1 + s2

a    0
a    2
a    4
b    6
b    8
dtype: int32

In [24]:
"""If the elements of the index are identical, but the order is different between the Series, a
Cartesian product occurs. Let's change the order of the index in s2 and rerun the same
operation:
"""    
s1 = pd.Series(index=list('aaabb'), data=np.arange(5))
s2 = pd.Series(index=list('bbaaa'), data=np.arange(5))
s1 + s2


a    2
a    3
a    4
a    3
a    4
a    5
a    4
a    5
a    6
b    3
b    4
b    4
b    5
dtype: int32

# 3.Exploding indexes
The previous recipe walked through a trivial example of two small Series being added
together with unequal indexes. This problem can produce comically incorrect results when
dealing with larger data.

In [25]:
#1.1. Read in the employee data and set the index equal to the race column:
movie= pd.read_csv('movie.csv')
movie.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [27]:
review1=movie["num_critic_for_reviews"]
review2=movie["num_critic_for_reviews"]
review1 is review2      #The salary1 and salary2 variables are actually referring to the same object.
                        #This means that any change to one will change the other. To ensure that you
                        #receive a brand new copy of the data, use the copy method:


True

In [28]:
review1=movie["num_critic_for_reviews"].copy()
review2=movie["num_critic_for_reviews"].copy()
review1 is review2

False

In [29]:
#4. Let's change the order of the index for one of the Series by sorting it:
movie

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4911,Color,Scott Smith,1.0,87.0,2.0,318.0,Daphne Zuniga,637.0,,Comedy|Drama,...,6.0,English,Canada,,,2013.0,470.0,7.7,,84
4912,Color,,43.0,43.0,,319.0,Valorie Curry,841.0,,Crime|Drama|Mystery|Thriller,...,359.0,English,USA,TV-14,,,593.0,7.5,16.00,32000
4913,Color,Benjamin Roberds,13.0,76.0,0.0,0.0,Maxwell Moody,0.0,,Drama|Horror|Thriller,...,3.0,English,USA,,1400.0,2013.0,0.0,6.3,,16
4914,Color,Daniel Hsia,14.0,100.0,0.0,489.0,Daniel Henney,946.0,10443.0,Comedy|Drama|Romance,...,9.0,English,USA,PG-13,,2012.0,719.0,6.3,2.35,660


In [31]:
movies=pd.read_csv("movie.csv",index_col="genres")

In [32]:
movies

Unnamed: 0_level_0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,actor_1_name,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
genres,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Action|Adventure|Fantasy|Sci-Fi,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,CCH Pounder,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
Action|Adventure|Fantasy,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Johnny Depp,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
Action|Adventure|Thriller,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Christoph Waltz,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
Action|Thriller,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Tom Hardy,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
Documentary,,Doug Walker,,,131.0,,Rob Walker,131.0,,Doug Walker,...,,,,,,,12.0,7.1,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Comedy|Drama,Color,Scott Smith,1.0,87.0,2.0,318.0,Daphne Zuniga,637.0,,Eric Mabius,...,6.0,English,Canada,,,2013.0,470.0,7.7,,84
Crime|Drama|Mystery|Thriller,Color,,43.0,43.0,,319.0,Valorie Curry,841.0,,Natalie Zea,...,359.0,English,USA,TV-14,,,593.0,7.5,16.00,32000
Drama|Horror|Thriller,Color,Benjamin Roberds,13.0,76.0,0.0,0.0,Maxwell Moody,0.0,,Eva Boehnke,...,3.0,English,USA,,1400.0,2013.0,0.0,6.3,,16
Comedy|Drama|Romance,Color,Daniel Hsia,14.0,100.0,0.0,489.0,Daniel Henney,946.0,10443.0,Alan Ruck,...,9.0,English,USA,PG-13,,2012.0,719.0,6.3,2.35,660


In [33]:
movies.sort_index()

Unnamed: 0_level_0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,actor_1_name,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
genres,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Action,Color,Alec Asten,1.0,82.0,5.0,237.0,Dennis L.A. White,472.0,,Luis Sanchez,...,5.0,English,USA,R,500000.0,2015.0,251.0,4.3,,62
Action,,John Stockwell,2.0,90.0,134.0,354.0,T.J. Storm,260000.0,,Matthew Ziff,...,1.0,,USA,,17000000.0,2016.0,454.0,9.1,,0
Action,Color,RZA,208.0,107.0,561.0,353.0,RZA,746.0,15608545.0,Rick Yune,...,224.0,English,USA,R,15000000.0,2012.0,561.0,5.4,2.35,29000
Action,Color,Tim Burton,153.0,126.0,13000.0,390.0,Vincent Schiavelli,920.0,162831698.0,Michael Gough,...,610.0,English,USA,PG-13,80000000.0,1992.0,811.0,7.0,1.85,0
Action,Color,Chao-Bin Su,50.0,117.0,2.0,21.0,Shawn Yue,149.0,,Woo-sung Jung,...,34.0,Mandarin,China,R,12000000.0,2010.0,32.0,6.9,2.35,1000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Western,Color,Robert M. Young,,105.0,0.0,467.0,Bruce McGill,883.0,,Barry Corbin,...,2.0,English,USA,,1250000.0,1982.0,655.0,7.1,,32
Western,Color,Clint Eastwood,38.0,115.0,16000.0,240.0,Chris Penn,16000.0,41400000.0,Clint Eastwood,...,138.0,English,USA,R,6900000.0,1985.0,455.0,7.3,2.35,0
Western,Color,Leonard Farlinger,8.0,89.0,0.0,413.0,Gary Farmer,584.0,,Keith Carradine,...,8.0,English,Canada,R,5000000.0,2007.0,580.0,5.3,2.35,31
Western,Color,John Ford,55.0,103.0,673.0,89.0,Ben Johnson,281.0,,Harry Carey Jr.,...,90.0,English,USA,Unrated,1600000.0,1949.0,230.0,7.4,1.37,721


In [34]:
movies=movies.sort_index()

In [35]:
movies

Unnamed: 0_level_0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,actor_1_name,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
genres,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Action,Color,Alec Asten,1.0,82.0,5.0,237.0,Dennis L.A. White,472.0,,Luis Sanchez,...,5.0,English,USA,R,500000.0,2015.0,251.0,4.3,,62
Action,,John Stockwell,2.0,90.0,134.0,354.0,T.J. Storm,260000.0,,Matthew Ziff,...,1.0,,USA,,17000000.0,2016.0,454.0,9.1,,0
Action,Color,RZA,208.0,107.0,561.0,353.0,RZA,746.0,15608545.0,Rick Yune,...,224.0,English,USA,R,15000000.0,2012.0,561.0,5.4,2.35,29000
Action,Color,Tim Burton,153.0,126.0,13000.0,390.0,Vincent Schiavelli,920.0,162831698.0,Michael Gough,...,610.0,English,USA,PG-13,80000000.0,1992.0,811.0,7.0,1.85,0
Action,Color,Chao-Bin Su,50.0,117.0,2.0,21.0,Shawn Yue,149.0,,Woo-sung Jung,...,34.0,Mandarin,China,R,12000000.0,2010.0,32.0,6.9,2.35,1000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Western,Color,Robert M. Young,,105.0,0.0,467.0,Bruce McGill,883.0,,Barry Corbin,...,2.0,English,USA,,1250000.0,1982.0,655.0,7.1,,32
Western,Color,Clint Eastwood,38.0,115.0,16000.0,240.0,Chris Penn,16000.0,41400000.0,Clint Eastwood,...,138.0,English,USA,R,6900000.0,1985.0,455.0,7.3,2.35,0
Western,Color,Leonard Farlinger,8.0,89.0,0.0,413.0,Gary Farmer,584.0,,Keith Carradine,...,8.0,English,Canada,R,5000000.0,2007.0,580.0,5.3,2.35,31
Western,Color,John Ford,55.0,103.0,673.0,89.0,Ben Johnson,281.0,,Harry Carey Jr.,...,90.0,English,USA,Unrated,1600000.0,1949.0,230.0,7.4,1.37,721


In [38]:
#5. Let's add these imdb_score Series together:
moviesadd=movies["imdb_score"]+movies["imdb_score"]
moviesadd.head

<bound method NDFrame.head of genres
Action      8.6
Action     18.2
Action     10.8
Action     14.0
Action     13.8
           ... 
Western    14.2
Western    14.6
Western    10.6
Western    14.8
Western    17.8
Name: imdb_score, Length: 4916, dtype: float64>

#  4. Filling values with unequal indexes

# 5.Appending columns from different DataFrames

In [40]:
# 1. Import the employee data and select the DEPARTMENT and BASE_SALARY
#columns in a new DataFrame:
employee = pd.read_csv('employee.csv')
dept_sal = employee[['DEPARTMENT', 'BASE_SALARY']]


FileNotFoundError: [Errno 2] No such file or directory: 'employee.csv'

# Highlighting the maximum value from each column

# Replicating idxmax with method chaining

# Finding the most common maximum