In [2]:
import pandas as pd
import numpy as np

# Consequences of Duplicate Labels

Some pandas methods (`Series.reindex()`, for example) simply don’t work when duplicate labels are present. The output can’t be determined unambiguously, so pandas raises an error.

In [4]:
s1 = pd.Series([0, 1, 2], index=["a", "b", "b"])

# Reindexing is ambiguous when the existing index repeats a label ("b" here),
# so pandas raises. Catch the error and print it: the message is still shown,
# but a top-to-bottom "Restart & Run All" is no longer halted by this cell.
try:
    s1.reindex(["a", "b", "c"])
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: cannot reindex on an axis with duplicate labels

In [5]:
# Two columns deliberately share the label "A" to show how duplicate
# column labels affect selection below.
df1 = pd.DataFrame(
    data=[[0, 1, 2], [3, 4, 5]],
    columns=["A", "A", "B"],
)

df1

Unnamed: 0,A,A.1,B
0,0,1,2
1,3,4,5


In [6]:
# "B" appears once among the columns, so selecting it yields a Series.
df1["B"]  # a Series


0    2
1    5
Name: B, dtype: int64

In [7]:
# "A" labels two columns, so the same selection syntax yields a DataFrame
# containing both of them.
df1["A"]  # a DataFrame


Unnamed: 0,A,A.1
0,0,1
1,3,4


In [8]:
# Same idea on the row axis: the index label "a" is used for two rows.
df2 = pd.DataFrame(
    data={"A": [0, 1, 2]},
    index=["a", "a", "b"],
)

df2

Unnamed: 0,A
a,0
a,1
b,2


In [9]:
# Row label "b" is unique, so this row/column lookup yields a single value.
df2.loc["b", "A"]  # a scalar


2

In [10]:
# Row label "a" occurs twice, so the same lookup yields a Series holding
# the value from each matching row.
df2.loc["a", "A"]  # a Series


a    0
a    1
Name: A, dtype: int64

# Duplicate Label Detection

In [11]:
# Show df2 again for reference: row label "a" occurs twice in its index.
df2

Unnamed: 0,A
a,0
a,1
b,2


In [12]:
# False here, because the row index contains the label "a" more than once.
df2.index.is_unique

False

In [13]:
# True here: df2 has a single column, "A", so no column label repeats.
df2.columns.is_unique


True

In [14]:
# Boolean mask over the index: True marks a label that already appeared
# earlier in the index (the second "a"), False marks first occurrences.
df2.index.duplicated()


array([False,  True, False])

In [15]:
# Invert the mask to keep only the first row for each index label; the
# second "a" row is dropped.
df2.loc[~df2.index.duplicated(), :]


Unnamed: 0,A
a,0
b,2


In [17]:
# To avoid discarding the duplicated rows, aggregate them instead: group on
# the index (level=0) and take the mean of each group.

df2.groupby(level=0).mean()


Unnamed: 0,A
a,0.5
b,2.0


# Disallowing Duplicate Labels

In [18]:
# Disallowing duplicates on an object that already contains them raises
# DuplicateLabelError. Catch and print it so the message is still displayed
# without aborting a top-to-bottom run of the notebook.
try:
    pd.Series([0, 1, 2], index=["a", "b", "b"]).set_flags(allows_duplicate_labels=False)
except pd.errors.DuplicateLabelError as err:
    print(f"{type(err).__name__}: {err}")


DuplicateLabelError: Index has duplicates.
      positions
label          
b        [1, 2]

In [22]:
# All labels are unique here, so turning off duplicate labels succeeds.
df = pd.DataFrame(
    data=[[0, 1, 2], [3, 4, 5]],
    columns=["A", "B", "C"],
).set_flags(allows_duplicate_labels=False)

In [23]:
# Display the frame that was built with allows_duplicate_labels=False.
df

Unnamed: 0,A,B,C
0,0,1,2
1,3,4,5


In [24]:
# Read the flag back; False means duplicate labels are disallowed on df.
df.flags.allows_duplicate_labels

False

In [25]:
# set_flags returns a new object (per the pandas docs) instead of changing df,
# so the flag can be re-enabled on the result.
# NOTE(review): this rebinds df2, which earlier cells used for the
# duplicate-index example frame.
df2 = df.set_flags(allows_duplicate_labels=True)
df2.flags.allows_duplicate_labels


True

In [27]:
# The flag can also be assigned directly through the .flags attribute,
# which changes df2 itself rather than producing a new object.
df2.flags.allows_duplicate_labels = False
df2.flags.allows_duplicate_labels



False

In [29]:
# Load the raw table, collapse rows that share an index label, and mark the
# result so that duplicate labels cannot be reintroduced later.
raw = pd.read_csv("../../data/baseball.csv")
deduplicated = (
    raw.groupby(level=0)
    .first()  # remove duplicates: one row per index label
    .set_flags(allows_duplicate_labels=False)  # disallow going forward
)

In [31]:
# Read the same CSV path as the previous cell again and display the frame.
raw = pd.read_csv("../../data/baseball.csv")
raw

Unnamed: 0,id,player,year,stint,team,lg,g,ab,r,h,...,rbi,sb,cs,bb,so,ibb,hbp,sh,sf,gidp
0,88641,womacto01,2006,2,CHN,NL,19,50,6,14,...,2.0,1.0,1.0,4,4.0,0.0,0.0,3.0,0.0,0.0
1,88643,schilcu01,2006,1,BOS,AL,31,2,0,1,...,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0
2,88645,myersmi01,2006,1,NYA,AL,62,0,0,0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,88649,helliri01,2006,1,MIL,NL,20,3,0,0,...,0.0,0.0,0.0,0,2.0,0.0,0.0,0.0,0.0,0.0
4,88650,johnsra05,2006,1,NYA,AL,33,6,0,1,...,0.0,0.0,0.0,0,4.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,89525,benitar01,2007,2,FLO,NL,34,0,0,0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
96,89526,benitar01,2007,1,SFN,NL,19,0,0,0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
97,89530,ausmubr01,2007,1,HOU,NL,117,349,38,82,...,25.0,6.0,1.0,37,74.0,3.0,6.0,4.0,1.0,11.0
98,89533,aloumo01,2007,1,NYN,NL,87,328,51,112,...,49.0,3.0,0.0,27,30.0,5.0,2.0,0.0,3.0,13.0


In [32]:
# Group rows by index label (level=0) and keep one row per label.
# NOTE(review): the rows shown in the output match raw, presumably because
# the default index produced by read_csv has no repeated labels - confirm.
raw.groupby(level=0).first()

Unnamed: 0,id,player,year,stint,team,lg,g,ab,r,h,...,rbi,sb,cs,bb,so,ibb,hbp,sh,sf,gidp
0,88641,womacto01,2006,2,CHN,NL,19,50,6,14,...,2.0,1.0,1.0,4,4.0,0.0,0.0,3.0,0.0,0.0
1,88643,schilcu01,2006,1,BOS,AL,31,2,0,1,...,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0
2,88645,myersmi01,2006,1,NYA,AL,62,0,0,0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,88649,helliri01,2006,1,MIL,NL,20,3,0,0,...,0.0,0.0,0.0,0,2.0,0.0,0.0,0.0,0.0,0.0
4,88650,johnsra05,2006,1,NYA,AL,33,6,0,1,...,0.0,0.0,0.0,0,4.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,89525,benitar01,2007,2,FLO,NL,34,0,0,0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
96,89526,benitar01,2007,1,SFN,NL,19,0,0,0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
97,89530,ausmubr01,2007,1,HOU,NL,117,349,38,82,...,25.0,6.0,1.0,37,74.0,3.0,6.0,4.0,1.0,11.0
98,89533,aloumo01,2007,1,NYN,NL,87,328,51,112,...,49.0,3.0,0.0,27,30.0,5.0,2.0,0.0,3.0,13.0


## Duplicate Label Propagation


In [33]:
# Scalar 0 is broadcast to both labels; the resulting Series is then flagged
# so that duplicate labels are not allowed on it.
s1 = pd.Series(data=0, index=["a", "b"])
s1 = s1.set_flags(allows_duplicate_labels=False)

s1

a    0
b    0
dtype: int64