In [None]:
"""
Duplicate Emails

Write a SQL query to find all duplicate emails in a table named Person
"""

In [None]:
"""test case"""
# Fixture layout: "headers" maps the table name to its column names and
# "rows" maps the same table name to a list of row values in that column order.
{"headers": {"Person": ["Id", "Email"]}, "rows": {"Person": [[1, "a@b.com"], [2, "c@d.com"], [3, "a@b.com"]]}}

In [None]:
"""SQL table creation and data insertion"""
-- Rebuild the Person table from scratch so the cell can be re-run safely.
DROP TABLE IF EXISTS Person;

-- Columns mirror the test case headers: Id and Email.
-- The key column is referenced as an identifier (not a quoted string), and
-- every column definition is comma-separated from the key constraint.
CREATE TABLE Person (
    Id    INT         NOT NULL,
    Email VARCHAR(32) NOT NULL,
    PRIMARY KEY (Id)
);

-- Explicit column list keeps the insert independent of column order.
INSERT INTO Person (Id, Email) VALUES
    (1, 'a@b.com'),
    (2, 'c@d.com'),
    (3, 'a@b.com');

In [None]:
"""SQL solution"""
-- 1: group the rows by email and keep only the groups holding two or more rows.
SELECT Email
FROM Person
GROUP BY Email
HAVING COUNT(*) >= 2;

-- 2: self-join on equal Email but different Id; DISTINCT collapses the
--    several matching pairs that each duplicated email produces.
SELECT DISTINCT a.Email
FROM Person AS a
INNER JOIN Person AS b
    ON a.Email = b.Email
    AND a.Id != b.Id;

-- 3: number the rows inside each email partition; rn = 2 exists exactly once
--    for every email that occurs at least twice, so each duplicate is listed once.
--    ORDER BY Id makes the numbering deterministic.
--    Requires window-function support (e.g. MySQL 8.0 or later).
SELECT a.Email
FROM (
    SELECT Email,
           ROW_NUMBER() OVER (PARTITION BY Email ORDER BY Id) AS rn
    FROM Person
) AS a
WHERE a.rn = 2;

In [2]:
"""pandas dataframe creation"""
import pandas as pd

# Same fixture as the test case cell: column names under "headers",
# row values under "rows", both keyed by table name.
testcase = {"headers": {"Person": ["Id", "Email"]}, "rows": {"Person": [[1, "a@b.com"], [2, "c@d.com"], [3, "a@b.com"]]}}

# The rows are a list of lists, so build the frame with the plain constructor
# and name the columns in the same call (from_dict is meant for dict input).
Person = pd.DataFrame(
    testcase['rows']['Person'],
    columns=testcase['headers']['Person'],
)
Person.head()

Unnamed: 0,Id,Email
0,1,a@b.com
1,2,c@d.com
2,3,a@b.com


In [8]:
# Approach 1
# Boolean flag per row: True when the row's Email already appeared in an
# earlier row (the first occurrence is not flagged).
Person.duplicated(subset='Email', keep='first')

Unnamed: 0,Id,Email
2,3,a@b.com


In [11]:
# Keep only the rows flagged as repeats and project the Email column in one
# .loc call (no chained indexing). drop_duplicates() ensures an email that
# occurs three or more times is still reported only once.
Person.loc[Person.duplicated(subset=['Email']), 'Email'].drop_duplicates()

2    a@b.com
Name: Email, dtype: object

In [13]:
# Approach 2: keep the first row seen for each Email by dropping every row
# that is flagged as a repeat of an earlier one.
is_repeat = Person.duplicated(subset=['Email'])
Person_unique = Person[~is_repeat]
Person_unique

Unnamed: 0,Id,Email
0,1,a@b.com
1,2,c@d.com


In [17]:
# Rows whose index label is absent from Person_unique are the repeated
# occurrences. Selecting them by index avoids the element-wise mask + dropna()
# round-trip, which would also discard rows that legitimately contain NaN.
# drop_duplicates() reports each duplicated email once even if it occurs
# three or more times. Relies on Person having unique index labels.
Person.loc[~Person.index.isin(Person_unique.index), 'Email'].drop_duplicates()

2    a@b.com
Name: Email, dtype: object