In [1]:
import io, pandas as pd

from pyspark import SparkConf, SparkContext
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import Normalizer
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession, SQLContext

# Obtain the Spark session explicitly so the notebook also runs on a fresh
# kernel that does not pre-define `spark`; getOrCreate() reuses an already
# active session when there is one.
spark = SparkSession.builder.getOrCreate()

# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
IRIS_CSV_PATH = '/home/pygmy/Projects/Veni/Bisnis_Pintar/Iris.csv'

# Keep the pandas frame under its own name instead of overwriting it with the
# Spark DataFrame, so both stages stay inspectable.
iris_pdf = pd.read_csv(IRIS_CSV_PATH)

df = spark.createDataFrame(iris_pdf)

# Normalizer only accepts Vector columns, so the CSV column has to be wrapped
# into a one-element vector first (assumes PetalWidthCm is numeric -- TODO confirm).
# Usage: normalizer.transform(petal_width_assembler.transform(df))
petal_width_assembler = VectorAssembler(inputCols=["PetalWidthCm"], outputCol="PetalWidthVec")

normalizer = Normalizer(p=2.0, inputCol="PetalWidthVec", outputCol="features")

### Normalizer

In [13]:
from pyspark.ml.linalg import Vectors

# Three dense sample vectors, each tagged with an integer id.
sample_rows = [
    (0, Vectors.dense([1.0, 0.5, -1.0])),
    (1, Vectors.dense([2.0, 1.0, 1.0])),
    (2, Vectors.dense([4.0, 10.0, 2.0])),
]
dataFrame = spark.createDataFrame(sample_rows, schema=["id", "features"])

# Normalize each Vector using $L^1$ norm (p=1.0); the result is written to "normFeatures".
normalizer = Normalizer(p=1.0, inputCol="features", outputCol="normFeatures")
l1NormData = normalizer.transform(dataFrame)

print("Normalized using L^1 norm")
l1NormData.show()

Normalized using L^1 norm
+---+--------------+------------------+
| id|      features|      normFeatures|
+---+--------------+------------------+
|  0|[1.0,0.5,-1.0]|    [0.4,0.2,-0.4]|
|  1| [2.0,1.0,1.0]|   [0.5,0.25,0.25]|
|  2|[4.0,10.0,2.0]|[0.25,0.625,0.125]|
+---+--------------+------------------+



### Bucketizer

In [11]:
from pyspark.ml.feature import Bucketizer

In [12]:
# Single-column sample data for the Bucketizer demo; the last two entries are NaN.
values = [(x,) for x in (0.1, 0.4, 1.2, 1.5, float("nan"), float("nan"))]
df = spark.createDataFrame(values, schema=["values"])


In [13]:
# Four split points delimit three buckets over the "values" column.
split_points = [float("-inf"), 0.5, 1.4, float("inf")]
bucketizer = Bucketizer(inputCol="values", outputCol="buckets", splits=split_points)

In [14]:
# handleInvalid="keep" keeps the NaN rows instead of failing on them
# (this setting is stored on `bucketizer` itself).
bucketizer.setHandleInvalid("keep")
bucketed = bucketizer.transform(df)

In [15]:
# Print the input values next to the bucket index assigned to each of them.
bucketed.show()

+------+-------+
|values|buckets|
+------+-------+
|   0.1|    0.0|
|   0.4|    0.0|
|   1.2|    1.0|
|   1.5|    2.0|
|   NaN|    3.0|
|   NaN|    3.0|
+------+-------+



### PCA

In [3]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import PCA
# One sparse and two dense 5-dimensional sample vectors, each wrapped in a
# one-element tuple so that it becomes a single-column row.
sample_vectors = [
    Vectors.sparse(5, [1, 3], [1.0, 7.0]),
    Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),
    Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),
]
data = [(vec,) for vec in sample_vectors]
df = spark.createDataFrame(data, schema=["features"])

In [4]:
pca = PCA(k=2, inputCol="features", outputCol="pca_features")

In [5]:
pca = pca.fit(df)

In [6]:
pca = pca.transform(df)

In [7]:
pca.show()

+--------------------+--------------------+
|            features|        pca_features|
+--------------------+--------------------+
| (5,[1,3],[1.0,7.0])|[1.64857282308838...|
|[2.0,0.0,3.0,4.0,...|[-4.6451043317815...|
|[4.0,0.0,0.0,6.0,...|[-6.4288805356764...|
+--------------------+--------------------+



### Standard Scaler

In [18]:
from pyspark.ml.feature import StandardScaler

In [20]:
from pyspark.ml.linalg import Vectors
# Two one-element vectors in a single column "a".
df = spark.createDataFrame(
    [(Vectors.dense([v]),) for v in (0.0, 2.0)],
    schema=["a"],
)

# Estimator and fitted model are kept under separate names; `model` is the fitted scaler.
scaler = StandardScaler(inputCol="a", outputCol="scaled")
model = scaler.fit(df)

# Last expression of the cell: displays the mean computed during fitting.
model.mean

DenseVector([1.0])

In [22]:
# Apply the fitted scaler and print the original next to the scaled column.
scaled_df = model.transform(df)
scaled_df.show()

+-----+-------------------+
|    a|             scaled|
+-----+-------------------+
|[0.0]|              [0.0]|
|[2.0]|[1.414213562373095]|
+-----+-------------------+



### Tokenizer

In [2]:
from pyspark.ml.feature import Tokenizer

In [3]:
# Read the pipe-separated items file with pandas, then hand it over to Spark.
items_pdf = pd.read_csv("/home/pygmy/Projects/Moti/public.items.csv", sep="|")
df = spark.createDataFrame(items_pdf)

# Tokenize the "items" text column into a "words" array column.
# `tokenizer` ends up bound to the tokenized DataFrame (not to the Tokenizer
# object); later cells read the DataFrame under this name.
tokenizer = Tokenizer(inputCol="items", outputCol="words").transform(df)
tokenizer.show()

+---+--------------------+--------------------+
| id|               items|               words|
+---+--------------------+--------------------+
|  0|Raw Shrimp, Seedl...|[raw, shrimp,, se...|
|  1|Cracked Wheat, St...|[cracked, wheat,,...|
|  2|Beet Apple Carrot...|[beet, apple, car...|
|  3|               Vodka|             [vodka]|
|  4|Globe Eggplant, P...|[globe, eggplant,...|
|  5|Organic Baby Spin...|[organic, baby, s...|
|  6|Reduced Fat Crack...|[reduced, fat, cr...|
|  7|Organic Red Onion...|[organic, red, on...|
|  8|Organic Cripps Pi...|[organic, cripps,...|
|  9|Organic Baby Spin...|[organic, baby, s...|
| 10|Uncured Beef Hot ...|[uncured, beef, h...|
| 11|Donut House Choco...|[donut, house, ch...|
| 12|[Concentrated But...|[[concentrated, b...|
| 13|Raspberries, Gree...|[raspberries,, gr...|
| 14|Original Tofurky ...|[original, tofurk...|
| 15|Extra Hold Non-Ae...|[extra, hold, non...|
| 16|Organic Coconut M...|[organic, coconut...|
| 17|No. 485 Gin, Mont...|[no., 485, gin

### StopWords

In [4]:
from pyspark.ml.feature import StopWordsRemover

# Filter stop words out of the "words" column of the tokenized DataFrame
# (which is bound to the name `tokenizer`) into a new "removed" column.
remover = StopWordsRemover(inputCol="words", outputCol="removed")
removed_df = remover.transform(tokenizer)
removed_df.show()

+---+--------------------+--------------------+--------------------+
| id|               items|               words|             removed|
+---+--------------------+--------------------+--------------------+
|  0|Raw Shrimp, Seedl...|[raw, shrimp,, se...|[raw, shrimp,, se...|
|  1|Cracked Wheat, St...|[cracked, wheat,,...|[cracked, wheat,,...|
|  2|Beet Apple Carrot...|[beet, apple, car...|[beet, apple, car...|
|  3|               Vodka|             [vodka]|             [vodka]|
|  4|Globe Eggplant, P...|[globe, eggplant,...|[globe, eggplant,...|
|  5|Organic Baby Spin...|[organic, baby, s...|[organic, baby, s...|
|  6|Reduced Fat Crack...|[reduced, fat, cr...|[reduced, fat, cr...|
|  7|Organic Red Onion...|[organic, red, on...|[organic, red, on...|
|  8|Organic Cripps Pi...|[organic, cripps,...|[organic, cripps,...|
|  9|Organic Baby Spin...|[organic, baby, s...|[organic, baby, s...|
| 10|Uncured Beef Hot ...|[uncured, beef, h...|[uncured, beef, h...|
| 11|Donut House Choco...|[donut, 

In [9]:
# Display the schema repr: `tokenizer` is the tokenized DataFrame here, not the Tokenizer transformer.
tokenizer

DataFrame[id: bigint, items: string, words: array<string>]

In [37]:
from pyspark.ml.feature import StopWordsRemover

# Two already-tokenized sentences keyed by id.
sentence_rows = [
    (0, "I saw the red balloon".split()),
    (1, "Mary had a little lamb".split()),
]
sentenceData = spark.createDataFrame(sentence_rows, schema=["id", "raw"])

# Remove stop words from "raw" into "filtered" and print full cell contents.
remover = StopWordsRemover(inputCol="raw", outputCol="filtered")
filtered_df = remover.transform(sentenceData)
filtered_df.show(truncate=False)

+---+----------------------------+--------------------+
|id |raw                         |filtered            |
+---+----------------------------+--------------------+
|0  |[I, saw, the, red, balloon] |[saw, red, balloon] |
|1  |[Mary, had, a, little, lamb]|[Mary, little, lamb]|
+---+----------------------------+--------------------+



In [38]:
# Same transformation printed with show()'s default settings, which truncate long cell values.
remover.transform(sentenceData).show()

+---+--------------------+--------------------+
| id|                 raw|            filtered|
+---+--------------------+--------------------+
|  0|[I, saw, the, red...| [saw, red, balloon]|
|  1|[Mary, had, a, li...|[Mary, little, lamb]|
+---+--------------------+--------------------+



### One Hot Encoder

In [2]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer

# Small categorical sample: six rows over the categories a / b / c.
df = spark.createDataFrame([
    (0, "a"),
    (1, "b"),
    (2, "c"),
    (3, "a"),
    (4, "a"),
    (5, "c")
], ["id", "category"])

# Map each category string to a numeric index in "categoryIndex".
stringIndexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
model = stringIndexer.fit(df)
indexed = model.transform(df)

encoder = OneHotEncoder(inputCol="categoryIndex", outputCol="categoryVec")
# Since Spark 3.0 OneHotEncoder is an Estimator and must be fitted before it
# can transform; on Spark 2.x it is a plain Transformer without fit().
# Supporting both keeps the cell runnable on either version.
encoder_model = encoder.fit(indexed) if hasattr(encoder, "fit") else encoder
encoded = encoder_model.transform(indexed)
encoded.show()


+---+--------+-------------+-------------+
| id|category|categoryIndex|  categoryVec|
+---+--------+-------------+-------------+
|  0|       a|          0.0|(2,[0],[1.0])|
|  1|       b|          2.0|    (2,[],[])|
|  2|       c|          1.0|(2,[1],[1.0])|
|  3|       a|          0.0|(2,[0],[1.0])|
|  4|       a|          0.0|(2,[0],[1.0])|
|  5|       c|          1.0|(2,[1],[1.0])|
+---+--------+-------------+-------------+



In [3]:
# Reload the pipe-separated items file and convert it straight into a Spark DataFrame.
df = spark.createDataFrame(
    pd.read_csv("/home/pygmy/Projects/Moti/public.items.csv", sep="|")
)

In [4]:
# Preview only the first 5 rows of the items DataFrame.
df.show(5)


+---+--------------------+
| id|               items|
+---+--------------------+
|  0|Raw Shrimp, Seedl...|
|  1|Cracked Wheat, St...|
|  2|Beet Apple Carrot...|
|  3|               Vodka|
|  4|Globe Eggplant, P...|
+---+--------------------+
only showing top 5 rows



In [5]:
# Indexer that maps every distinct "items" string to a numeric "categoryIndex".
stringIndexer = StringIndexer(
    outputCol="categoryIndex",
    inputCol="items",
)

In [7]:
# Fit the indexer on the items DataFrame, then add the "categoryIndex" column to it.
model = stringIndexer.fit(df)
indexed = model.transform(df)

In [8]:
# Print the items together with the index assigned to each of them.
indexed.show()

+---+--------------------+-------------+
| id|               items|categoryIndex|
+---+--------------------+-------------+
|  0|Raw Shrimp, Seedl...|       3304.0|
|  1|Cracked Wheat, St...|      12898.0|
|  2|Beet Apple Carrot...|       1166.0|
|  3|               Vodka|        103.0|
|  4|Globe Eggplant, P...|     116559.0|
|  5|Organic Baby Spin...|     100966.0|
|  6|Reduced Fat Crack...|      86958.0|
|  7|Organic Red Onion...|      79118.0|
|  8|Organic Cripps Pi...|      69377.0|
|  9|Organic Baby Spin...|     104739.0|
| 10|Uncured Beef Hot ...|     106295.0|
| 11|Donut House Choco...|       3993.0|
| 12|[Concentrated But...|      39701.0|
| 13|Raspberries, Gree...|     114490.0|
| 14|Original Tofurky ...|      15893.0|
| 15|Extra Hold Non-Ae...|      58880.0|
| 16|Organic Coconut M...|      69949.0|
| 17|No. 485 Gin, Mont...|      37464.0|
| 18|Red Vine Tomato, ...|      93873.0|
| 19|Organic Baby Arug...|      84761.0|
+---+--------------------+-------------+
only showing top

In [9]:
# One-hot encode the item index into a sparse "categoryVec" column.
encoder = OneHotEncoder(inputCol="categoryIndex", outputCol="categoryVec")
# Since Spark 3.0 OneHotEncoder is an Estimator and must be fitted before it
# can transform; on Spark 2.x it is a plain Transformer without fit().
# Supporting both keeps the cell runnable on either version.
encoder_model = encoder.fit(indexed) if hasattr(encoder, "fit") else encoder
encoded = encoder_model.transform(indexed)
encoded.show()

+---+--------------------+-------------+--------------------+
| id|               items|categoryIndex|         categoryVec|
+---+--------------------+-------------+--------------------+
|  0|Raw Shrimp, Seedl...|       3304.0|(126765,[3304],[1...|
|  1|Cracked Wheat, St...|      12898.0|(126765,[12898],[...|
|  2|Beet Apple Carrot...|       1166.0|(126765,[1166],[1...|
|  3|               Vodka|        103.0|(126765,[103],[1.0])|
|  4|Globe Eggplant, P...|     116559.0|(126765,[116559],...|
|  5|Organic Baby Spin...|     100966.0|(126765,[100966],...|
|  6|Reduced Fat Crack...|      86958.0|(126765,[86958],[...|
|  7|Organic Red Onion...|      79118.0|(126765,[79118],[...|
|  8|Organic Cripps Pi...|      69377.0|(126765,[69377],[...|
|  9|Organic Baby Spin...|     104739.0|(126765,[104739],...|
| 10|Uncured Beef Hot ...|     106295.0|(126765,[106295],...|
| 11|Donut House Choco...|       3993.0|(126765,[3993],[1...|
| 12|[Concentrated But...|      39701.0|(126765,[39701],[...|
| 13|Ras

### Min Max Scaler

In [12]:
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.linalg import Vectors

In [13]:
# Two one-element vectors in a single column "a" as input for the scaler.
df = spark.createDataFrame(
    [(Vectors.dense([v]),) for v in (0.0, 2.0)],
    schema=["a"],
)

In [14]:
# Estimator that rescales the vectors in column "a" and writes the result to "scaled".
mmScaler = MinMaxScaler(inputCol="a", outputCol="scaled")

In [16]:
model = mmScaler.fit(df)

In [21]:
model = model.transform(df)

In [22]:
model.show()

+-----+------+
|    a|scaled|
+-----+------+
|[0.0]| [0.0]|
|[2.0]| [1.0]|
+-----+------+



In [2]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer

# Small categorical sample: six rows over the categories a / b / c.
df = spark.createDataFrame([
    (0, "a"),
    (1, "b"),
    (2, "c"),
    (3, "a"),
    (4, "a"),
    (5, "c")
], ["id", "category"])

# Same indexer as before, with handleInvalid and stringOrderType spelled out explicitly.
stringIndexer = StringIndexer(inputCol="category", outputCol="categoryIndex", handleInvalid="error",
                              stringOrderType="frequencyDesc")
model = stringIndexer.fit(df)
indexed = model.transform(df)

encoder = OneHotEncoder(inputCol="categoryIndex", outputCol="categoryVec")
# Since Spark 3.0 OneHotEncoder is an Estimator and must be fitted before it
# can transform; on Spark 2.x it is a plain Transformer without fit().
# Supporting both keeps the cell runnable on either version.
encoder_model = encoder.fit(indexed) if hasattr(encoder, "fit") else encoder
encoded = encoder_model.transform(indexed)
encoded.show()


+---+--------+-------------+-------------+
| id|category|categoryIndex|  categoryVec|
+---+--------+-------------+-------------+
|  0|       a|          0.0|(2,[0],[1.0])|
|  1|       b|          2.0|    (2,[],[])|
|  2|       c|          1.0|(2,[1],[1.0])|
|  3|       a|          0.0|(2,[0],[1.0])|
|  4|       a|          0.0|(2,[0],[1.0])|
|  5|       c|          1.0|(2,[1],[1.0])|
+---+--------+-------------+-------------+

