# **Set Operations**

NumPy provides various set-operation functions, such as union, intersection, and set difference.

In [None]:
import numpy as np

In [None]:
# Structured dtype describing one record of the cars93 CSV: text columns are
# fixed-width unicode strings (16 characters), numeric columns are float64.
#
# `np.str_` is used instead of the `np.unicode_` alias, which was removed in
# NumPy 2.0; both produce the same '<U16' field type.
#
# NOTE: values longer than 16 characters are truncated on load (see e.g.
# '"Driver & Passen' in the outputs further down). The width is kept at 16 so
# the dtype stays identical to the one shown in this notebook.
# NOTE(review): "MPG,highway" looks like a typo for "MPG.highway"; the field
# name is left unchanged so existing lookups by name keep working.
dt = np.dtype([("Manufacturer", np.str_, 16),
               ("Model", np.str_, 16),
               ("Type", np.str_, 16),
               ("Min.Price", np.float64),
               ("Price", np.float64),
               ("MAX.Price", np.float64),
               ("MPG.city", np.float64),
               ("MPG,highway", np.float64),
               ("AirBags", np.str_, 16),
               ("DriveTrain", np.str_, 16),
               ("Cylinders", np.str_, 16),
               ("EngineSize", np.float64),
               ("Horsepower", np.float64),
               ("RPM", np.float64),
               ("Rev.per.mile", np.float64),
               ("Man.trans.avail", np.str_, 16),
               ("Fuel.tank.capacity", np.float64),
               ("Passengers", np.float64),
               ("Length", np.float64),
               ("Wheelbase", np.float64),
               ("Width", np.float64),
               ("Turn.circle", np.float64),
               ("Rear.seat.room", np.float64),
               ("Luggage.room", np.float64),
               ("Weight", np.float64),
               ("Origin", np.str_, 16),
               ("Make", np.str_, 16)
               ])
dt

dtype([('Manufacturer', '<U16'), ('Model', '<U16'), ('Type', '<U16'), ('Min.Price', '<f8'), ('Price', '<f8'), ('MAX.Price', '<f8'), ('MPG.city', '<f8'), ('MPG,highway', '<f8'), ('AirBags', '<U16'), ('DriveTrain', '<U16'), ('Cylinders', '<U16'), ('EngineSize', '<f8'), ('Horsepower', '<f8'), ('RPM', '<f8'), ('Rev.per.mile', '<f8'), ('Man.trans.avail', '<U16'), ('Fuel.tank.capacity', '<f8'), ('Passengers', '<f8'), ('Length', '<f8'), ('Wheelbase', '<f8'), ('Width', '<f8'), ('Turn.circle', '<f8'), ('Rear.seat.room', '<f8'), ('Luggage.room', '<f8'), ('Weight', '<f8'), ('Origin', '<U16'), ('Make', '<U16')])

In [None]:
# Load the cars93 CSV into a 1-D structured array using the `dt` dtype defined
# in the previous cell; `skip_header=1` drops the column-name row.
# NumPy is already imported in the first code cell, so it is not re-imported.
data = np.genfromtxt('Datasets/cars93.csv',
                     delimiter=',',
                     dtype=dt,
                     skip_header=1)
# Preview the first few records rather than dumping the whole array.
data[:5]

In [None]:
# A structured array is 1-D here: one record per CSV row, columns stored as fields.
data.shape

(93,)

In [None]:
# Select a single field (column) of the structured array by name.
data['Manufacturer']

**Get unique Manufacturer names**<br>
Use `np.unique()`. As a side effect, it returns the unique items sorted in ascending order.

In [None]:
# Distinct manufacturer names, returned in sorted order.
np.unique(data['Manufacturer'])

array(['"Acura"', '"Audi"', '"BMW"', '"Buick"', '"Cadillac"',
       '"Chevrolet"', '"Chrylser"', '"Chrysler"', '"Dodge"', '"Eagle"',
       '"Ford"', '"Geo"', '"Honda"', '"Hyundai"', '"Infiniti"', '"Lexus"',
       '"Lincoln"', '"Mazda"', '"Mercedes-Benz"', '"Mercury"',
       '"Mitsubishi"', '"Nissan"', '"Oldsmobile"', '"Plymouth"',
       '"Pontiac"', '"Saab"', '"Saturn"', '"Subaru"', '"Suzuki"',
       '"Toyota"', '"Volkswagen"', '"Volvo"'], dtype='<U16')

**Counts**

In [None]:
# `return_counts=True` makes np.unique also return how many times each unique
# value occurs; pair every name with its count to get a name -> count mapping.
key, values = np.unique(data['Manufacturer'], return_counts=True)
{name: count for name, count in zip(key, values)}

{'"Acura"': 2,
 '"Audi"': 2,
 '"BMW"': 1,
 '"Buick"': 4,
 '"Cadillac"': 2,
 '"Chevrolet"': 8,
 '"Chrylser"': 1,
 '"Chrysler"': 2,
 '"Dodge"': 6,
 '"Eagle"': 2,
 '"Ford"': 8,
 '"Geo"': 2,
 '"Honda"': 3,
 '"Hyundai"': 4,
 '"Infiniti"': 1,
 '"Lexus"': 2,
 '"Lincoln"': 2,
 '"Mazda"': 5,
 '"Mercedes-Benz"': 2,
 '"Mercury"': 2,
 '"Mitsubishi"': 2,
 '"Nissan"': 4,
 '"Oldsmobile"': 4,
 '"Plymouth"': 1,
 '"Pontiac"': 5,
 '"Saab"': 1,
 '"Saturn"': 1,
 '"Subaru"': 3,
 '"Suzuki"': 1,
 '"Toyota"': 4,
 '"Volkswagen"': 4,
 '"Volvo"': 2}

**Remove Duplicates**<br>
Setting `return_index=True` also returns the row positions of the first occurrences of the unique values. We can use these indices to get rid of the duplicates.

In [None]:
# The double brackets select a list of fields (here just one), so the result is
# still a structured array; this form extends to several fields at once.
# `return_index=True` additionally returns, for each unique record, the
# position of its first occurrence in `data`.
unique, index = np.unique(data[['Manufacturer']], axis=0, return_index=True)
unique, index

In [None]:
# Unique at 'Manufacturer and Model' level.
# unique, index = np.unique(data[['Manufacturer', 'Model']], axis=0, return_index=True)
# unique, index

In [None]:
# Keep only the first occurrence of each manufacturer: `index` holds the
# integer row positions returned by np.unique(..., return_index=True).
data[index]

array([('"Acura"', '"Integra"', '"Small"', 12.9, 15.9, 18.8, 25., 31., '"None"', '"Front"', '"4"', 1.8, 140., 6300., 2890., '"Yes"', 13.2, 5., 177., 102., 68., 37., 26.5, 11., 2705., '"non-USA"', '"Acura Integra"'),
       ('"Audi"', '"90"', '"Compact"', 25.9, 29.1, 32.3, 20., 26., '"Driver only"', '"Front"', '"6"', 2.8, 172., 5500., 2280., '"Yes"', 16.9, 5., 180., 102., 67., 37., 28. , 14., 3375., '"non-USA"', '"Audi 90"'),
       ('"BMW"', '"535i"', '"Midsize"', 23.7, 30. , 36.2, 22., 30., '"Driver only"', '"Rear"', '"4"', 3.5, 208., 5700., 2545., '"Yes"', 21.1, 4., 186., 109., 69., 39., 27. , 13., 3640., '"non-USA"', '"BMW 535i"'),
       ('"Buick"', '"Century"', '"Midsize"', 14.2, 15.7, 17.3, 22., 31., '"Driver only"', '"Front"', '"4"', 2.2, 110., 5200., 2565., '"No"', 16.4, 6., 189., 105., 69., 41., 28. , 16., 2880., '"USA"', '"Buick Century"'),
       ('"Cadillac"', '"DeVille"', '"Large"', 33. , 34.7, 36.3, 16., 25., '"Driver only"', '"Front"', '"8"', 4.9, 200., 4100., 1510., '"N

**Keep Only Duplicates:**<br>
Remove the first occurrences, keeping only the repeated rows.

In [None]:
# Keep all rows except the first occurrences.
# `index` is an array of integer positions, so `~index` would be a bitwise NOT
# (-index - 1) and select rows counted from the end of the array instead of
# excluding anything. Build a boolean mask over all rows instead.
is_duplicate = np.ones(len(data), dtype=bool)
is_duplicate[index] = False  # first occurrences are not duplicates
data[is_duplicate]

array([('"Volvo"', '"850"', '"Midsize"', 24.8, 26.7, 28.5, 20., 28., '"Driver & Passen', '"Front"', '"5"', 2.4, 168., 6200., 2310., '"Yes"', 19.3, 5., 184., 105., 69., 38., 30. , 15., 3245., '"non-USA"', '"Volvo 850"'),
       ('"Volkswagen"', '"Corrado"', '"Sporty"', 22.9, 23.3, 23.7, 18., 25., '"None"', '"Front"', '"6"', 2.8, 178., 5800., 2385., '"Yes"', 18.5, 4., 159.,  97., 66., 36., 26. , 15., 2810., '"non-USA"', '"Volkswagen Corr'),
       ('"Volkswagen"', '"Eurovan"', '"Van"', 16.6, 19.7, 22.7, 17., 21., '"None"', '"Front"', '"5"', 2.5, 109., 4500., 2915., '"Yes"', 21.1, 7., 187., 115., 72., 38., 34. , nan, 3960., '"non-USA"', '"Volkswagen Euro'),
       ('"Volkswagen"', '"Fox"', '"Small"',  8.7,  9.1,  9.5, 25., 33., '"None"', '"Front"', '"4"', 1.8,  81., 5500., 2550., '"Yes"', 12.4, 4., 163.,  93., 63., 34., 26. , 10., 2240., '"non-USA"', '"Volkswagen Fox"'),
       ('"Toyota"', '"Tercel"', '"Small"',  7.8,  9.8, 11.8, 32., 37., '"Driver only"', '"Front"', '"4"', 1.5,  82., 52

**Check Membership**

In [None]:
# Draw two samples (with replacement) of manufacturer names to serve as the
# sets A and B in the examples below. A seeded generator makes the samples,
# and therefore every result derived from them, reproducible across runs.
rng = np.random.default_rng(42)
A = rng.choice(data['Manufacturer'], 20, replace=True)
B = rng.choice(data['Manufacturer'], 10, replace=True)

print(A)
print(B)

['"Subaru"' '"Pontiac"' '"Oldsmobile"' '"Subaru"' '"Toyota"'
 '"Oldsmobile"' '"Pontiac"' '"Audi"' '"Oldsmobile"' '"Buick"' '"Toyota"'
 '"Oldsmobile"' '"Ford"' '"BMW"' '"Volkswagen"' '"Mazda"' '"Pontiac"'
 '"Mazda"' '"Volkswagen"' '"Nissan"']
['"Ford"' '"Cadillac"' '"Volvo"' '"Plymouth"' '"Geo"' '"Lexus"' '"Mazda"'
 '"Hyundai"' '"Nissan"' '"Mazda"']


**Find elements of A that are present in B**

In [None]:
# Boolean mask over A: True where that element of A also occurs in B.
# `np.isin` replaces `np.in1d`, which is deprecated as of NumPy 2.0; for 1-D
# inputs such as A and B the result is the same.
A_in_B = np.isin(A, B)
A_in_B

array([False, False, False, False, False, False, False, False, False,
       False, False, False,  True, False, False,  True, False,  True,
       False,  True])

In [None]:
# Boolean-mask indexing keeps the elements of A flagged True above
# (repeated values in A are kept as repeats).
A[A_in_B]

array(['"Ford"', '"Mazda"', '"Mazda"', '"Nissan"'], dtype='<U16')

**Union**

In [None]:
# Sorted, de-duplicated values that appear in A or in B.
np.union1d(A, B)

array(['"Audi"', '"BMW"', '"Buick"', '"Cadillac"', '"Ford"', '"Geo"',
       '"Hyundai"', '"Lexus"', '"Mazda"', '"Nissan"', '"Oldsmobile"',
       '"Plymouth"', '"Pontiac"', '"Subaru"', '"Toyota"', '"Volkswagen"',
       '"Volvo"'], dtype='<U16')

**Intersection**

In [None]:
# Sorted, de-duplicated values that appear in both A and B.
np.intersect1d(A, B)

array(['"Ford"', '"Mazda"', '"Nissan"'], dtype='<U16')

**Set Difference**

In [None]:
# Sorted, de-duplicated values of A that do not appear in B.
np.setdiff1d(A, B)  # A - B

array(['"Audi"', '"BMW"', '"Buick"', '"Oldsmobile"', '"Pontiac"',
       '"Subaru"', '"Toyota"', '"Volkswagen"'], dtype='<U16')