In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, SQLContext

In [2]:
# No earlier cell creates `spark`; presumably it was injected by the kernel
# (e.g. a notebook launched through the pyspark shell) -- TODO confirm.
# Obtain the session explicitly so the notebook also works from a fresh,
# plain Python kernel. getOrCreate() hands back the existing session when
# one is already running, so an injected `spark` keeps working unchanged.
spark = SparkSession.builder.getOrCreate()
spark

In [3]:
# Toy corpus: two rows, each an integer label plus a list of string tokens.
a = spark.createDataFrame(
    data=[
        (0, ["a", "b", "c"]),
        (1, ["a", "b", "b", "c", "a"]),
    ],
    schema=["label", "raw"],
)

In [4]:
# Echo the DataFrame as the cell result; the repr lists column names and types.
a

DataFrame[label: bigint, raw: array<string>]

In [5]:
from pyspark.ml.feature import CountVectorizer

# Estimator that reads the token arrays in `raw`; the fitted model writes its
# output to the `vectors` column.
cv = CountVectorizer(
    outputCol="vectors",
    inputCol="raw",
)

# Fit on the toy DataFrame to obtain the fitted model.
model = cv.fit(a)

In [6]:
from pyspark.sql import DataFrame as SparkDataFrame

# This cell rebinds `model` from the fitted CountVectorizer model to the
# transformed DataFrame. Without the guard, running the cell a second time
# would call `.transform(a)` on that DataFrame and fail. With it, a re-run is
# a no-op, while a freshly fitted `model` is still transformed as before.
if not isinstance(model, SparkDataFrame):
    model = model.transform(a)

In [7]:
# `model` was rebound in the previous cell to the DataFrame returned by
# `transform(a)`, so this prints that DataFrame (label, raw, vectors).
model.show()

+-----+---------------+--------------------+
|label|            raw|             vectors|
+-----+---------------+--------------------+
|    0|      [a, b, c]|(3,[0,1,2],[1.0,1...|
|    1|[a, b, b, c, a]|(3,[0,1,2],[2.0,2...|
+-----+---------------+--------------------+



In [8]:
import pandas as pd

# Load the pipe-delimited items file into a pandas DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere unless the file exists at exactly this location.
df2 = pd.read_csv(
    '/home/pygmy/Projects/Moti/public.items.csv',
    sep='|',
)

In [9]:
from pyspark.ml.feature import Tokenizer
from pyspark.sql import DataFrame as SparkDataFrame

# Convert the pandas frame loaded above into a Spark DataFrame, reusing the
# name `df2`. Because the name is rebound to a different type, re-running the
# original cell would hand a Spark DataFrame back to createDataFrame and fail.
# The guard makes a second run a no-op instead.
if not isinstance(df2, SparkDataFrame):
    df2 = spark.createDataFrame(df2)


In [10]:
# Preview the first two rows of `df2` (columns `id` and `items`).
df2.show(2)

+---+--------------------+
| id|               items|
+---+--------------------+
|  0|Raw Shrimp, Seedl...|
|  1|Cracked Wheat, St...|
+---+--------------------+
only showing top 2 rows



In [11]:
# Tokenizer stage: reads the `items` text column, writes the `tokenized` column.
# NOTE(review): the captured output of the later show() cell contains tokens
# such as `shrimp,` and `[concentrated`, i.e. punctuation stays attached to
# the words. If that is not intended, a RegexTokenizer may be a better fit --
# confirm.
tokenized = Tokenizer(
    inputCol="items",
    outputCol="tokenized",
)

In [12]:
# The original cell called `.transform` on `tokenized` and rebound the same
# name to the resulting DataFrame, so a second run of the cell failed because
# `tokenized` was no longer a Tokenizer. Tokenizer carries no fitted state, so
# building it inline gives the same result and keeps the cell re-runnable.
tokenized = Tokenizer(inputCol="items", outputCol="tokenized").transform(df2)

In [13]:
# By this point `tokenized` refers to the DataFrame returned by `transform`,
# not the Tokenizer; print it to inspect the new `tokenized` column.
tokenized.show()

+---+--------------------+--------------------+
| id|               items|           tokenized|
+---+--------------------+--------------------+
|  0|Raw Shrimp, Seedl...|[raw, shrimp,, se...|
|  1|Cracked Wheat, St...|[cracked, wheat,,...|
|  2|Beet Apple Carrot...|[beet, apple, car...|
|  3|               Vodka|             [vodka]|
|  4|Globe Eggplant, P...|[globe, eggplant,...|
|  5|Organic Baby Spin...|[organic, baby, s...|
|  6|Reduced Fat Crack...|[reduced, fat, cr...|
|  7|Organic Red Onion...|[organic, red, on...|
|  8|Organic Cripps Pi...|[organic, cripps,...|
|  9|Organic Baby Spin...|[organic, baby, s...|
| 10|Uncured Beef Hot ...|[uncured, beef, h...|
| 11|Donut House Choco...|[donut, house, ch...|
| 12|[Concentrated But...|[[concentrated, b...|
| 13|Raspberries, Gree...|[raspberries,, gr...|
| 14|Original Tofurky ...|[original, tofurk...|
| 15|Extra Hold Non-Ae...|[extra, hold, non...|
| 16|Organic Coconut M...|[organic, coconut...|
| 17|No. 485 Gin, Mont...|[no., 485, gin

In [14]:
from pyspark.ml.feature import CountVectorizer

# Estimator that reads the `tokenized` column; the fitted model writes its
# output to the `countvectorized` column.
countvector = CountVectorizer(
    inputCol="tokenized",
    outputCol="countvectorized",
)

In [15]:
# The original cell called `.fit` on `countvector` and rebound the same name
# to the fitted model, so it only worked while `countvector` was still the
# unfitted estimator. Building the estimator inline, with the same columns,
# makes the cell give the same result however many times it is run.
countvector = CountVectorizer(
    inputCol="tokenized",
    outputCol="countvectorized",
).fit(tokenized)

In [16]:
from pyspark.sql import DataFrame as SparkDataFrame

# This cell rebinds `countvector` from the fitted model to the transformed
# DataFrame. Guard on the type so that running the cell again is a no-op
# rather than an error, without repeating the vocabulary fit. A freshly
# fitted model is still transformed exactly as before.
if not isinstance(countvector, SparkDataFrame):
    countvector = countvector.transform(tokenized)

In [17]:
# `countvector` now refers to the transformed DataFrame; print it to inspect
# the `countvectorized` column next to the input columns.
countvector.show()

+---+--------------------+--------------------+--------------------+
| id|               items|           tokenized|     countvectorized|
+---+--------------------+--------------------+--------------------+
|  0|Raw Shrimp, Seedl...|[raw, shrimp,, se...|(13637,[0,11,26,3...|
|  1|Cracked Wheat, St...|[cracked, wheat,,...|(13637,[0,2,5,7,8...|
|  2|Beet Apple Carrot...|[beet, apple, car...|(13637,[0,45,88,1...|
|  3|               Vodka|             [vodka]|(13637,[1141],[1.0])|
|  4|Globe Eggplant, P...|[globe, eggplant,...|(13637,[0,1,31,37...|
|  5|Organic Baby Spin...|[organic, baby, s...|(13637,[0,2,4,5,6...|
|  6|Reduced Fat Crack...|[reduced, fat, cr...|(13637,[1,3,5,11,...|
|  7|Organic Red Onion...|[organic, red, on...|(13637,[0,1,2,5,7...|
|  8|Organic Cripps Pi...|[organic, cripps,...|(13637,[0,26,35,5...|
|  9|Organic Baby Spin...|[organic, baby, s...|(13637,[0,6,9,12,...|
| 10|Uncured Beef Hot ...|[uncured, beef, h...|(13637,[0,6,8,10,...|
| 11|Donut House Choco...|[donut, 