
Commit 93b0661

add spark api
1 parent a692e5c commit 93b0661

6 files changed: +250 −1 lines changed

misc/SPARK.md

+18
@@ -0,0 +1,18 @@
# Spark Tinkering

Run the Spark + Jupyter container with the command below, then go to [http://localhost:8888](http://localhost:8888).

```bash
docker run -it \
    -p 9870:9870 \
    -p 8088:8088 \
    -p 8080:8080 \
    -p 18080:18080 \
    -p 9000:9000 \
    -p 8888:8888 \
    -p 9864:9864 \
    -v $HOME/git/py-pair/misc/ipynb:/root/ipynb \
    -e PYSPARK_MASTER=spark://localhost:7077 \
    -e NOTEBOOK_PASSWORD='' \
    oneoffcoder/spark-jupyter
```
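The first cell of the notebook added below calls `sqlContext.createDataFrame(...)` without defining `sqlContext`, so it presumably relies on the container image injecting that handle. A minimal sketch for creating the handle yourself when running outside the container (the session settings here are illustrative assumptions, not part of this commit):

```python
from pyspark.sql import SparkSession, SQLContext

# Assumed local setup; the spark-jupyter container normally provides these handles already.
spark = SparkSession.builder \
    .master('local[*]') \
    .appName('spark-tinkering') \
    .getOrCreate()
sqlContext = SQLContext(spark.sparkContext)  # legacy handle the notebook expects
```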

misc/ipynb/binary-binary.ipynb

+135
@@ -0,0 +1,135 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "get_data = lambda x, y, n: [(x, y) * 2 for _ in range(n)]\n",
    "data = get_data(1, 1, 207) + get_data(1, 0, 282) + get_data(0, 1, 231) + get_data(0, 0, 242)\n",
    "pdf = pd.DataFrame(data, columns=['x1', 'x2', 'x3', 'x4'])\n",
    "sdf = sqlContext.createDataFrame(pdf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- x1: long (nullable = true)\n",
      " |-- x2: long (nullable = true)\n",
      " |-- x3: long (nullable = true)\n",
      " |-- x4: long (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "sdf.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "from itertools import combinations\n",
    "\n",
    "def to_counts(d):\n",
    "    def as_key(k1, k2):\n",
    "        keys = sorted([k1, k2])\n",
    "        return keys[0], keys[1]\n",
    "    \n",
    "    def as_count(v1, v2):\n",
    "        a, b, c, d = 0, 0, 0, 0\n",
    "        if v1 is not None and v2 is not None:\n",
    "            if v1 == 1 and v2 == 1:\n",
    "                a = 1\n",
    "            elif v1 == 1 and v2 == 0:\n",
    "                b = 1\n",
    "            elif v1 == 0 and v2 == 1:\n",
    "                c = 1\n",
    "            else:\n",
    "                d = 1\n",
    "        return a, b, c, d\n",
    "    \n",
    "    def transform(k1, k2):\n",
    "        v1, v2 = d[k1], d[k2]\n",
    "        return as_key(k1, k2), as_count(v1, v2)\n",
    "    \n",
    "    return [transform(k1, k2) for k1, k2 in combinations(d.keys(), 2)]\n",
    "\n",
    "def add_counts(a, b):\n",
    "    return a[0] + b[0], a[1] + b[1], a[2] + b[2], a[3] + b[3]\n",
    "\n",
    "# to_counts({'x1': 1, 'x2': 1, 'x3': 1, 'x4': 1})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(('x1', 'x2'), (207, 282, 231, 242)),\n",
       " (('x1', 'x3'), (489, 0, 0, 473)),\n",
       " (('x1', 'x4'), (207, 282, 231, 242)),\n",
       " (('x2', 'x3'), (207, 231, 282, 242)),\n",
       " (('x2', 'x4'), (438, 0, 0, 524)),\n",
       " (('x3', 'x4'), (207, 282, 231, 242))]"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sdf.rdd\\\n",
    "    .flatMap(lambda r: to_counts(r.asDict()))\\\n",
    "    .reduceByKey(lambda a, b: add_counts(a, b))\\\n",
    "    .sortByKey()\\\n",
    "    .collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
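Each entry in the RDD result above is a flattened 2×2 contingency table for one column pair: the counts (a, b, c, d) record how many rows have the value combinations (1, 1), (1, 0), (0, 1) and (0, 0), respectively. A minimal pandas-only cross-check of the counting scheme for the ('x1', 'x2') pair (an illustration, not part of the commit):

```python
import pandas as pd

# Same synthetic data as the notebook: each row is (x, y, x, y).
get_data = lambda x, y, n: [(x, y) * 2 for _ in range(n)]
data = get_data(1, 1, 207) + get_data(1, 0, 282) + get_data(0, 1, 231) + get_data(0, 0, 242)
pdf = pd.DataFrame(data, columns=['x1', 'x2', 'x3', 'x4'])

# (a, b, c, d) = counts of (1,1), (1,0), (0,1), (0,0) for the pair (x1, x2).
ct = pd.crosstab(pdf['x1'], pdf['x2'])
a, b, c, d = ct.loc[1, 1], ct.loc[1, 0], ct.loc[0, 1], ct.loc[0, 0]
print(a, b, c, d)  # 207 282 231 242, matching the ('x1', 'x2') entry above
```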

pypair/spark.py

+50
@@ -0,0 +1,50 @@
from itertools import combinations

from pypair.contigency import BinaryMeasures


def binary_binary(sdf):
    """
    Computes binary-binary association measures for every pair of columns in a
    Spark DataFrame of 0/1 values. Returns a dictionary mapping each sorted
    (column, column) pair to a dictionary of measures.
    """
    def to_counts(d):
        # Turn one row (as a dict) into [(pair_key, (a, b, c, d)), ...] for every column
        # pair, where a, b, c, d flag the value combinations (1,1), (1,0), (0,1), (0,0).
        def as_key(k1, k2):
            keys = sorted([k1, k2])
            return keys[0], keys[1]

        def as_count(v1, v2):
            a, b, c, d = 0, 0, 0, 0
            if v1 is not None and v2 is not None:
                if v1 == 1 and v2 == 1:
                    a = 1
                elif v1 == 1 and v2 == 0:
                    b = 1
                elif v1 == 0 and v2 == 1:
                    c = 1
                else:
                    d = 1
            return a, b, c, d

        def transform(k1, k2):
            v1, v2 = d[k1], d[k2]
            return as_key(k1, k2), as_count(v1, v2)

        return [transform(k1, k2) for k1, k2 in combinations(d.keys(), 2)]

    def add_counts(a, b):
        # Element-wise sum of two (a, b, c, d) count tuples.
        return a[0] + b[0], a[1] + b[1], a[2] + b[2], a[3] + b[3]

    def to_results(counts):
        # Clip each cell to at least 1 to avoid degenerate tables, then compute all measures.
        (x1, x2), (a, b, c, d) = counts
        a = max(1, a)
        b = max(1, b)
        c = max(1, c)
        d = max(1, d)
        computer = BinaryMeasures(a, b, c, d)
        measures = {m: computer.get(m) for m in computer.measures()}
        return (x1, x2), measures

    results = sdf.rdd \
        .flatMap(lambda r: to_counts(r.asDict())) \
        .reduceByKey(lambda a, b: add_counts(a, b)) \
        .sortByKey() \
        .map(lambda counts: to_results(counts)) \
        .collect()
    return {tup[0]: tup[1] for tup in results}
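A quick usage sketch for the new `binary_binary` API (the Spark session settings are assumptions and the data mirrors the test below; the measure names in each result dictionary are whatever `BinaryMeasures.measures()` reports):

```python
import pandas as pd
from pyspark.sql import SparkSession

from pypair.spark import binary_binary

# Assumed local session, just for illustration.
spark = SparkSession.builder.master('local[2]').appName('binary-binary-demo').getOrCreate()

get_data = lambda x, y, n: [(x, y) * 2 for _ in range(n)]
data = get_data(1, 1, 207) + get_data(1, 0, 282) + get_data(0, 1, 231) + get_data(0, 0, 242)
sdf = spark.createDataFrame(pd.DataFrame(data, columns=['x1', 'x2', 'x3', 'x4']))

results = binary_binary(sdf)
# results maps ('x1', 'x2') -> {measure_name: value, ...}
print(results[('x1', 'x2')])

spark.stop()
```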

requirements.txt

+1
@@ -10,6 +10,7 @@ numpy
 scipy
 pandas
 scikit-learn
+pyspark
 # DOCUMENTATION
 sphinx
 sphinx_rtd_theme

setup.py

+1 −1
@@ -14,7 +14,7 @@
     long_description_content_type='text/markdown',
     url='https://github.com/oneoffcoder/py-pair',
     keywords=' '.join(['statistics', 'pairwise', 'association', 'correlation', 'measurement', 'strength']),
-    install_requires=['scipy', 'numpy', 'pandas', 'scikit-learn'],
+    install_requires=['scipy', 'numpy', 'pandas', 'scikit-learn', 'pyspark'],
     classifiers=[
         'Programming Language :: Python :: 3',
         'License :: OSI Approved :: Apache Software License',

tests/test_spark.py

+45
@@ -0,0 +1,45 @@
import logging
import unittest

import pandas as pd
from pyspark.sql import SparkSession

from pypair.spark import binary_binary


class PySparkTest(unittest.TestCase):
    @classmethod
    def supress_py4j_logging(cls):
        logger = logging.getLogger('py4j')
        logger.setLevel(logging.WARN)

    @classmethod
    def create_pyspark_session(cls):
        return (SparkSession.builder
                .master('local[4]')
                .appName('local-testing-pyspark')
                .getOrCreate())

    @classmethod
    def setUpClass(cls):
        cls.supress_py4j_logging()
        cls.spark = cls.create_pyspark_session()

    @classmethod
    def tearDownClass(cls):
        cls.spark.stop()

    def get_binary_binary_data(self):
        get_data = lambda x, y, n: [(x, y) * 2 for _ in range(n)]
        data = get_data(1, 1, 207) + get_data(1, 0, 282) + get_data(0, 1, 231) + get_data(0, 0, 242)
        pdf = pd.DataFrame(data, columns=['x1', 'x2', 'x3', 'x4'])
        sdf = self.spark.createDataFrame(pdf)
        return sdf


class BinaryBinaryTest(PySparkTest):
    def test(self):
        sdf = self.get_binary_binary_data()
        result = binary_binary(sdf)
        import json
        print(json.dumps({f'{k[0]}_{k[1]}': v for k, v in result.items()}, indent=1))
