The PR is to solve issue: https://github.com/pandas-dev/pandas/issues/38340  
The PR link is: https://github.com/pandas-dev/pandas/pull/38379

### 0. Compile pandas

In [98]:
import os
# Location of the pandas source checkout; raises KeyError if PANDAS_PATH is unset.
pandas_path = os.environ['PANDAS_PATH']
# os.chdir changes the working directory of this process. A `cd` issued through
# os.system would only affect a short-lived child shell, so none is needed.
os.chdir(pandas_path)
try:
    import pandas as pd
except ImportError:
    # Only a failed import triggers a build; any other error should surface
    # instead of being silently treated as "not compiled".
    print("Compiling...")
    os.system("python setup.py develop")
    # NOTE(review): without `-y` this command asks for confirmation, and it
    # runs right after the develop install - confirm this is intended.
    os.system("pip uninstall pandas")
    print("Compiled!")
else:
    print("Already compiled!")

Already compiled!


In [99]:
def get_so_file_names(path, keyword):
    """Collect every file below ``path`` whose file name contains ``keyword``.

    The directory tree rooted at ``path`` is traversed with ``os.walk``.
    Only the file name is tested against ``keyword``; directory names are
    not.

    Parameters
    ----------
    path : str
        Root directory of the search.
    keyword : str
        Substring a file name must contain to be collected.

    Returns
    -------
    set of str
        One entry per matching file, built as ``dirpath + os.sep + filename``.
    """
    return {
        os.sep.join((folder, name))
        for folder, _subfolders, names in os.walk(path)
        for name in names
        if keyword in name
    }

In [100]:
def rename_files(file_names, keyword):
    """Rename each file so that ``keyword`` no longer appears in its file name.

    Only the final path component is rewritten. Directory components are
    left untouched, so a parent directory whose name happens to contain
    ``keyword`` does not produce a target path in a non-existent directory.

    Parameters
    ----------
    file_names : collection of str
        Paths of the files to rename.
    keyword : str
        Substring to strip from every file name.

    Raises
    ------
    OSError
        Propagated from ``os.rename`` when a rename fails.
    """
    for file_name in file_names:
        directory, base_name = os.path.split(file_name)
        new_name = os.path.join(directory, base_name.replace(keyword, ""))
        os.rename(file_name, new_name)
    print(f"{len(file_names)} files renamed!")

In [101]:
# File-name fragment to strip from the files found below the pandas checkout.
# NOTE(review): presumably the platform tag of extension modules built by
# CPython 3.7 on macOS - confirm before running on another platform.
keyword = ".cpython-37m-darwin"
# Collect every file whose name contains the fragment, then rename them.
so_file_names = get_so_file_names(pandas_path, keyword)
rename_files(so_file_names, keyword)

0 files renamed!


### 2. Issue
Currently, `isin` on a nullable `Int64` Series can be considerably slower than on a plain `int64` Series:

In [102]:
import pandas as pd
import numpy as np

arr = np.random.randint(0, 10, 1_000_001)
target = [1, 2, 3, 20]

In [103]:
s1 = pd.Series(arr)
%timeit s1.isin(target)

2.89 ms ± 69.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
s2 = pd.Series(arr, dtype="Int64")
%timeit s2.isin(target)

### 3. Analysis
In `pandas/core/series.py`, `Series.isin` delegates to `algorithms.isin`, which might be the bottleneck.
```Python
def isin(self, values) -> "Series":
        result = algorithms.isin(self._values, values)
        return self._constructor(result, index=self.index).__finalize__(
            self, method="isin"
        )
```

In [None]:
import pandas as pd

In [None]:
%load_ext line_profiler

In [None]:
from pandas.core.algorithms import isin

In [None]:
%lprun -f isin isin(s2._values, target)

### 4. Solution
According to the profiling result, line 470 is the bottleneck:  
```
470         1       6074.0   6074.0     21.9          return isin(np.asarray(comps), np.asarray(values))
```

We can define and test a new `isin` as below; with it, the runtime decreases to about 2 ms.

In [None]:
from pandas.core.arrays.masked import BaseMaskedArray

def fast_isin(comps, values):
    """Run ``isin`` on the raw data of a masked array.

    For a ``BaseMaskedArray`` the underlying ``_data`` array is handed to
    ``isin``. Any other input is passed through unchanged. The mask is not
    consulted, so masked positions are matched by whatever value is stored
    for them in ``_data``.

    Parameters
    ----------
    comps : array-like
        Array whose elements are looked up.
    values : list-like
        Values to look for.

    Returns
    -------
    The result of ``isin`` for the (unwrapped) ``comps`` and ``values``.
    """
    haystack = comps._data if isinstance(comps, BaseMaskedArray) else comps
    return isin(haystack, values)

In [None]:
%timeit fast_isin(s2._values, target)

In [None]:
%timeit isin(s2._values, target)

### 5. Tests
We should test for the Extension Arrays below:
```Python
IntegerArray
FloatingArray
BooleanArray
```

### 5.1 IntegerArray

In [None]:
s3 = pd.Series(arr, dtype="Int64")
%timeit fast_isin(s3._values, target)

In [None]:
%timeit isin(s3._values, target)

### 5.2 FloatingArray

In [None]:
s4 = pd.Series(arr, dtype="Float64")
target_f = [1.0, 2.0, 3.0, 4.0]
%timeit fast_isin(s4._values, target_f)

In [None]:
%timeit isin(s4._values, target_f)

### 5.3 boolean

In [None]:
arr_b = arr > 5
s5 = pd.Series(arr_b, dtype="boolean")
target_b = [True, True, True, True]
%timeit fast_isin(s5._values, target_b)

In [None]:
%timeit isin(s5._values, target_b)

### 5.4 pd.NA in the first array
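If the first array contains `pd.NA`, the result can be incorrect, because the masked positions still hold a placeholder value in `comps._data` that may match one of the values. We can correct this by multiplying the result by `~comps._mask`.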

In [104]:
s6 = pd.Series([1, 2, 3, pd.NA, 4], dtype="Int64")
target = [1, 2, 3, 20]
fast_isin(s6._values, target)

NameError: name 'fast_isin' is not defined

In [None]:
def isin_for_masked_array(comps, values):
    """Run ``isin`` on a masked array and report masked positions as ``False``.

    Parameters
    ----------
    comps : array-like
        Array whose elements are looked up. For a ``BaseMaskedArray`` the
        lookup runs on ``_data`` and the result is multiplied by the
        inverted ``_mask``.
    values : list-like
        Values to look for.

    Returns
    -------
    The membership result. Inputs that are not a ``BaseMaskedArray`` are
    handed to ``isin`` unchanged.
    """
    if not isinstance(comps, BaseMaskedArray):
        return isin(comps, values)
    found = isin(comps._data, values)
    # Multiplying by the inverted mask zeroes every masked position.
    not_masked = np.invert(comps._mask)
    return found * not_masked

In [None]:
isin_for_masked_array(s6._values, target)

In [None]:
%timeit fast_isin(s3._values, target)

In [None]:
%timeit isin_for_masked_array(s3._values, target)

### 5.5 pd.NA in the second array
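If the second array contains `pd.NA`, the result can be incorrect as well: masked positions of the first array are still reported as `False` even though `pd.NA` is among the values. We can handle this by checking whether the second array contains `pd.NA`.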

In [None]:
s7 = pd.Series([1, 2, 3, pd.NA, 4], dtype="Int64")
target = [1, 2, 3, 20, pd.NA]
isin(s7._values, target)

In [None]:
# The result is incorrect
isin_for_masked_array(s7._values, target)

In [None]:
def isin_for_masked_array2(comps, values):
    """Run ``isin`` on a masked array, matching ``pd.NA`` against ``pd.NA``.

    Masked positions of ``comps`` are reported as ``True`` only when
    ``values`` itself contains ``pd.NA``; otherwise they are ``False``.

    Parameters
    ----------
    comps : array-like
        Array whose elements are looked up.
    values : iterable
        Values to look for. It is scanned element by element for ``pd.NA``.

    Returns
    -------
    The membership result. Inputs that are not a ``BaseMaskedArray`` are
    handed to ``isin`` unchanged.
    """
    if not isinstance(comps, BaseMaskedArray):
        return isin(comps, values)
    # Care is needed when values contains 1: the original author notes that
    # a masked array's NA slots hold 1 in _data. Multiplying by the inverted
    # mask discards whatever those slots matched.
    result = isin(comps._data, values) * np.invert(comps._mask)
    for candidate in values:
        if candidate is pd.NA:
            # pd.NA was asked for, so every masked position counts as a hit.
            result += comps._mask
            break
    return result

In [None]:
isin_for_masked_array2(pd.Series([2, 3, pd.NA, 4], dtype="Int64")._values, 
                       [2, 3, 20])

In [None]:
isin_for_masked_array2(pd.Series([2, 3, pd.NA, 4], dtype="Int64")._values, 
                       [2, 3, 20, pd.NA])

In [None]:
isin_for_masked_array2(pd.Series([2, 3, 4], dtype="Int64")._values, 
                       [2, 3, 20, pd.NA])

In [None]:
isin_for_masked_array2(pd.Series([2, 3, 4], dtype="Int64")._values, 
                       [2, 3, 20])

In [None]:
isin_for_masked_array2(pd.Series([2, 3, pd.NA, 4], dtype="Int64")._values, 
                       [1, 2, 3, 20])

In [None]:
isin_for_masked_array2(pd.Series([2, 3, pd.NA, 4], dtype="Int64")._values, 
                       [1, 2, 3, 20, pd.NA])

### 5.6 Test different array types for pd.NA existence

In [None]:
pd.isnull(pd.NA)

In [None]:
pd.isnull(pd.NaT)

In [None]:
pd.isna(pd.NA)

In [None]:
pd.isna(pd.NaT)

In [None]:
from copy import copy
pd.NA is copy(pd.NA)

In [None]:
any(x is pd.NA for x in [1, 2, pd.NA])

In [None]:
any(x is pd.NA for x in np.array([1, 2, pd.NA]))

In [None]:
any(x is pd.NA for x in pd.Series([1, 2, pd.NA]))

In [None]:
any(x is pd.NA for x in pd.Series([1, 2, pd.NA], dtype="Int64"))

In [None]:
any(x is pd.NA for x in np.array([1, 2, np.nan]))

In [None]:
any(x is pd.NA for x in [1, 2, pd.NaT])

### 5.7 Final test

In [None]:
result = pd.Series([1, 2, 3, 20], dtype="Int64").isin([1, 2, 3, 4])
result

In [None]:
result.values

In [None]:
result.values._mask

In [None]:
result.values._data

In [None]:
pd.Series([1, 2, 3, pd.NA], dtype="Int64").isin([1, 2, 3, 4])

In [None]:
pd.Series([1, 2, 3, pd.NA], dtype="Int64").isin([1, 2, 3, 4, pd.NaT])

In [None]:
pd.Series([1, 2, 3, pd.NA], dtype="Int64").isin([1, 2, 3, 4, pd.NA])

In [None]:
pd.Series([1, 2, 3], dtype="Int64").isin([1, 2, 3, 4, pd.NaT])

In [None]:
pd.Series([1, 2, 3], dtype="Int64").isin([1, 2, 3, 4, pd.NA])

In [None]:
pd.Series([1, 5], dtype="Int64").isin([1, 2, 3, 4, pd.NA])

In [None]:
pd.Series([1, 5], dtype="Int64").isin([1, 2, 3, 4])

In [None]:
pd.Series([5], dtype="Int64").isin([1, 2, 3, 4])

In [None]:
pd.Series([pd.NA], dtype="Int64").isin([1, 2, 3, 4])

In [None]:
pd.Series([], dtype="Int64").isin([1, 2, 3, 4])

In [None]:
pd.Series([1.0, 2.0, 5.0, pd.NA], dtype="Float64").isin([1.0, 2.0, 3.0, 4.0])

In [None]:
pd.Series([False, True], dtype="boolean").isin(pd.array([False], dtype="boolean"))

In [None]:
pd.Series([False, True, pd.NA], dtype="boolean").isin(pd.array([False, pd.NA], dtype="boolean"))

In [None]:
pd.Series([1, 5], dtype="Int64").isin(pd.array([1, 2, 3, 4], dtype="Int64"))

In [None]:
pd.Series([1.0, 5.0], dtype="Float64").isin(pd.array([1.0, 2.0, 3.0, 4.0], dtype="Float64"))

## 6. Why we should return True rather than pd.NA

In [None]:
None in [1, 2, 3, None]

In [None]:
np.nan in np.array([1, 2, np.nan])

In [None]:
pd.NA in pd.Series([1, 2, pd.NA])

## 7. Adding benchmark for series.isin

In [None]:
pd.Series(np.random.randint(1, 10, 100000)).astype("Int64").values

In [None]:
pd.Series(np.random.randint(1, 10, 100000)).astype("int64").values

In [None]:
pd.Series(np.random.randint(1, 10, 100000)).astype("Float64").values

In [None]:
pd.Series(np.random.randint(1, 10, 100000)).astype(np.float64).values

In [None]:
np.arange(10 ** 6, dtype="Float64")

In [None]:
pd.Series(np.random.randint(0, 2, 10)).astype("boolean").values

In [None]:
pd.Series(np.random.randint(0, 2, 10)).astype("bool").values

In [None]:
class IsInFloat64:
    """Benchmark ``Series.isin`` for a tiny float Series against large inputs.

    ``setup`` builds a two-element Series plus three large float64 arrays
    (many distinct values, a single repeated value, only NaN). Each
    ``time_*`` method looks the small Series up in one of them.
    """

    # dtypes the benchmark is repeated for
    params = [np.float64, "Float64"]

    def setup(self, dtype):
        """Create the small Series of ``dtype`` and the three lookup arrays."""
        self.small = Series([1, 2], dtype=dtype)
        self.many_different_values = np.arange(10 ** 6, dtype=np.float64)
        self.few_different_values = np.zeros(10 ** 7, dtype=np.float64)
        self.only_nans_values = np.full(10 ** 7, np.nan, dtype=np.float64)

    def time_isin_many_different(self):
        """Look the small Series up in an array of distinct values."""
        # runtime is dominated by creation of the lookup-table
        self.small.isin(self.many_different_values)

    def time_isin_few_different(self):
        """Look the small Series up in an array holding one repeated value."""
        # runtime is dominated by creation of the lookup-table
        self.small.isin(self.few_different_values)

    def time_isin_nan_values(self):
        """Look the small Series up in an array that holds only NaN."""
        # runtime is dominated by creation of the lookup-table
        # Use the NaN-only array here; the other methods cover the rest.
        self.small.isin(self.only_nans_values)

    def run(self):
        """Run ``setup`` and every timing method once for each dtype in ``params``."""
        for para in self.params:
            self.setup(para)
            self.time_isin_many_different()
            self.time_isin_few_different()
            self.time_isin_nan_values()

In [None]:
Series = pd.Series
IsInFloat64().run()

## 8. Improve the benchmarks of IsInLongSeriesValuesDominate

In [4]:
import numpy as np
Series = pd.Series
from pandas.core.algorithms import isin

In [5]:
class IsInLongSeriesValuesDominate:
    """Benchmark ``Series.isin`` when ``values`` is much longer than the Series.

    ``setup`` builds ``10 ** 7`` lookup values and a Series of
    ``10 ** 6 + 1`` elements. ``time_isin`` performs the lookup.
    """

    def setup(self, dtype="Float64", series_type="monotone"):
        """Create ``self.values`` and ``self.series``.

        Parameters
        ----------
        dtype : str or dtype, default "Float64"
            dtype of the Series. The lookup values use the lower-cased name
            when ``dtype`` is a string.
        series_type : {"monotone", "random"}, default "monotone"
            ``"monotone"`` uses ``np.arange``; ``"random"`` draws seeded
            random integers.

        Raises
        ------
        ValueError
            If ``series_type`` is neither ``"random"`` nor ``"monotone"``.
        """
        N = 10 ** 7
        if series_type == "random":
            np.random.seed(42)
            vals = np.random.randint(0, 10 * N, N)
        elif series_type == "monotone":
            vals = np.arange(N)
        else:
            # Fail with a clear message instead of hitting an unbound `vals`.
            raise ValueError(
                f"series_type must be 'random' or 'monotone', got {series_type!r}"
            )
        # `vals` is a NumPy array, and NumPy does not accept pandas'
        # capitalised extension-dtype names such as "Float64", so convert
        # with the lower-cased NumPy name instead.
        numpy_dtype = dtype.lower() if isinstance(dtype, str) else dtype
        self.values = vals.astype(numpy_dtype)
        M = 10 ** 6 + 1
        self.series = Series(np.arange(M)).astype(dtype)

    def time_isin(self, dtypes="Float64", series_type="monotone"):
        """Look ``self.series`` up in ``self.values``; both arguments are unused."""
        self.series.isin(self.values)

    def run(self):
        """Run the timed lookup once; ``setup`` must have been called first."""
        self.time_isin()

In [6]:
obj = IsInLongSeriesValuesDominate()
obj.setup()
%timeit obj.run()

  if __name__ == '__main__':


541 ms ± 8.86 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
class IsInLongSeriesValuesDominate:
    """Benchmark ``Series.isin`` when ``values`` is much longer than the Series.

    Variant with NumPy ``float64`` defaults. ``setup`` builds ``10 ** 7``
    lookup values and a Series of ``10 ** 6 + 1`` elements. ``time_isin``
    performs the lookup.
    """

    def setup(self, dtype="float64", series_type="monotone"):
        """Create ``self.values`` and ``self.series``.

        Parameters
        ----------
        dtype : str or dtype, default "float64"
            dtype of the Series. The lookup values use the lower-cased name
            when ``dtype`` is a string.
        series_type : {"monotone", "random"}, default "monotone"
            ``"monotone"`` uses ``np.arange``; ``"random"`` draws seeded
            random integers.

        Raises
        ------
        ValueError
            If ``series_type`` is neither ``"random"`` nor ``"monotone"``.
        """
        N = 10 ** 7
        if series_type == "random":
            np.random.seed(42)
            vals = np.random.randint(0, 10 * N, N)
        elif series_type == "monotone":
            vals = np.arange(N)
        else:
            # Fail with a clear message instead of hitting an unbound `vals`.
            raise ValueError(
                f"series_type must be 'random' or 'monotone', got {series_type!r}"
            )
        # `vals` is a NumPy array; lower-casing keeps the conversion valid
        # when a capitalised pandas extension-dtype name is passed.
        numpy_dtype = dtype.lower() if isinstance(dtype, str) else dtype
        self.values = vals.astype(numpy_dtype)
        M = 10 ** 6 + 1
        self.series = Series(np.arange(M)).astype(dtype)

    def time_isin(self, dtypes="float64", series_type="monotone"):
        """Look ``self.series`` up in ``self.values``; both arguments are unused."""
        self.series.isin(self.values)

    def run(self):
        """Run the timed lookup once; ``setup`` must have been called first."""
        self.time_isin()

In [None]:
obj = IsInLongSeriesValuesDominate()
obj.setup()
%timeit obj.run()

In [None]:
%timeit isin(series, target)

In [None]:
%load_ext line_profiler
%lprun -f isin isin(series, target)

## 9. Polish code

### 9.1 algorithm.isin

In [8]:
from pandas.core.dtypes.common import is_extension_array_dtype

In [12]:
is_extension_array_dtype(Series([1, 2], dtype="Int64").values)

True

In [13]:
is_extension_array_dtype(Series([1, 2], dtype="Float64").values)

True

In [14]:
is_extension_array_dtype(Series([0, 1], dtype="boolean").values)

True

In [18]:
is_extension_array_dtype(Series([0, 1], dtype="int64").values)

False

In [24]:
Series(["a", "b"], dtype="string").isin(["b", "c"])

0    False
1     True
dtype: bool

In [38]:
Series([pd.Interval(1, 2), pd.Interval(5, 6)], dtype="interval").values

array([Interval(1, 2, closed='right'), Interval(5, 6, closed='right')],
      dtype=object)

In [45]:
pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)]).isin([pd.Interval(1, 5)])

array([False,  True])

### 9.2 dtype=Boolean

In [46]:
Series(np.random.randint(0, 2, 100000)).astype("Boolean")

TypeError: data type 'Boolean' not understood

In [47]:
np.array([1, 0], dtype="Boolean")

TypeError: data type 'Boolean' not understood

### 9.3 pandas/tests/arithmetic/test_interval.py error
This error is caused by removing the following lines
```Python
elif is_interval_dtype(comps.dtype):
    return cast("IntervalArray", comps).isin(values)
```
in algorithms.py

In [63]:
import operator
from pandas.core.dtypes.common import is_list_like
import pandas._testing as tm
from pandas import IntervalIndex
from pandas.core.arrays import IntervalArray


def elementwise_comparison(op, array, other):
    """
    Apply `op` pairwise to the elements of `array` and `other`.

    A non-list-like `other` is repeated ``len(array)`` times first. The
    result is a NumPy array, or a Series carrying ``other.index`` when
    `other` is a Series.
    """
    if not is_list_like(other):
        # Repeat the scalar so that it pairs with every element of `array`.
        other = [other] * len(array)
    pairwise = np.array([op(left, right) for left, right in zip(array, other)])
    if not isinstance(other, Series):
        return pairwise
    return Series(pairwise, index=other.index)


# Reproduction of the interval comparison checks: a Series of intervals is
# compared with `operator.ne` against several kinds of `other`, and every
# result is checked against the element-by-element expectation.
constructor = pd.core.series.Series
op = operator.ne
expected_type = pd.core.series.Series
assert_func = tm.assert_series_equal

# Series built from an IntervalIndex with breaks 0, 1, 2, 3.
breaks = range(4)
index = constructor(IntervalIndex.from_breaks(breaks))

# scalar comparisons
# ... against an element taken from the Series itself
other = index[0]
result = op(index, other)
expected = expected_type(elementwise_comparison(op, index, other))
assert_func(result, expected)

# ... against a plain integer
other = breaks[0]
result = op(index, other)
expected = expected_type(elementwise_comparison(op, index, other))
assert_func(result, expected)

# list-like comparisons
# ... against an IntervalArray built from the same breaks
other = IntervalArray.from_breaks(breaks)
result = op(index, other)
expected = expected_type(elementwise_comparison(op, index, other))
assert_func(result, expected)

# ... against a mixed list: an interval, an integer and a string
other = [index[0], breaks[0], "foo"]
result = op(index, other)
expected = expected_type(elementwise_comparison(op, index, other))
assert_func(result, expected)

In [134]:
s6 = pd.Series(list(range(10000)) + [pd.NA], dtype="Int64")

In [135]:
%timeit s6.values._hasna

2.82 µs ± 29.7 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [136]:
%timeit pd.NA in s6

3.3 µs ± 13.2 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [137]:
%timeit any(pd.NA is x for x in s6)

4.35 ms ± 194 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [144]:
s8 = pd.Series(["a"] * 1000 + [pd.NA])

In [145]:
%timeit pd.NA in s8

3.46 µs ± 101 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [146]:
%timeit any(pd.NA is x for x in s8)

167 µs ± 1.65 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [152]:
s8.values.dtype == "object"

True