In [2]:
#Testing pyspark installation
import findspark
findspark.init()
findspark.find()
import pyspark
findspark.find()

'C:\\spark-3.0.0-bin-hadoop2.7\\spark-3.0.0-bin-hadoop2.7'

In [3]:
#Initiate Spark Context
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession


In [4]:
conf = pyspark.SparkConf().setAppName('SparkApp').setMaster('local')
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession(sc)

In [5]:
#Example Test Code
numeric_val = sc.parallelize([1,2,3,4])
numeric_val.map(lambda x: x*x*x).collect()

[1, 8, 27, 64]

In [6]:
from pyspark.ml.feature import Word2Vec

In [7]:
# Input data: Each row is a bag of words from a sentence or document.
documentDF = spark.createDataFrame([
    ("Hi I heard about Spark".split(" "), ),
    ("I wish Java could use case classes".split(" "), ),
    ("Logistic regression models are neat".split(" "), )
], ["text"])

# Learn a mapping from words to Vectors.
word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="result")
model = word2Vec.fit(documentDF)

result = model.transform(documentDF)
for row in result.collect():
    text, vector = row
    print("Text: [%s] => \nVector: %s\n" % (", ".join(text), str(vector)))

Text: [Hi, I, heard, about, Spark] => 
Vector: [0.031109690852463248,-0.020681756734848025,0.009653565101325513]

Text: [I, wish, Java, could, use, case, classes] => 
Vector: [0.01842194888740778,0.03821451057280813,0.06269602593965828]

Text: [Logistic, regression, models, are, neat] => 
Vector: [-0.005502792075276375,0.001944764330983162,0.008235244452953339]



# Krish exercise

In [8]:
from pyspark.sql import SparkSession
spark= SparkSession.builder.appName('Customers').getOrCreate()

In [9]:
from pyspark.ml.regression import LinearRegression

In [10]:
dataset=spark.read.csv("C:\\Users\\2304373.UNIPHOREIND\\Pictures\\pyspark_exercises\\PysparkRegressions-master\\Ecommerce_Customers.csv",inferSchema=True,header=True)

In [11]:
dataset

DataFrame[Email: string, Address: string, Avg Session Length: double, Time on App: double, Time on Website: double, Length of Membership: double, Yearly Amount Spent: double]

In [14]:
dataset.show() # like head()

+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+
|               Email|             Address|Avg Session Length|Time on App|Time on Website|Length of Membership|Yearly Amount Spent|
+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+
|mstephenson@ferna...|835 Frank TunnelW...|       34.49726773|12.65565115|    39.57766802|         4.082620633|         587.951054|
|   hduke@hotmail.com|4547 Archer Commo...|       31.92627203|11.10946073|    37.26895887|         2.664034182|        392.2049334|
|    pallen@yahoo.com|24645 Valerie Uni...|       33.00091476|11.33027806|    37.11059744|         4.104543202|        487.5475049|
|riverarebecca@gma...|1414 David Throug...|       34.30555663|13.71751367|    36.72128268|         3.120178783|         581.852344|
|mstephens@davidso...|14023 Rodriguez P...|       33.33067252|12.79518855|  

In [15]:
dataset.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [16]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [17]:
featureassembler=VectorAssembler(inputCols=["Avg Session Length","Time on App","Time on Website","Length of Membership"],outputCol="Independent Features")

In [18]:
output=featureassembler.transform(dataset)

In [19]:
output.show()

+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+--------------------+
|               Email|             Address|Avg Session Length|Time on App|Time on Website|Length of Membership|Yearly Amount Spent|Independent Features|
+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+--------------------+
|mstephenson@ferna...|835 Frank TunnelW...|       34.49726773|12.65565115|    39.57766802|         4.082620633|         587.951054|[34.49726773,12.6...|
|   hduke@hotmail.com|4547 Archer Commo...|       31.92627203|11.10946073|    37.26895887|         2.664034182|        392.2049334|[31.92627203,11.1...|
|    pallen@yahoo.com|24645 Valerie Uni...|       33.00091476|11.33027806|    37.11059744|         4.104543202|        487.5475049|[33.00091476,11.3...|
|riverarebecca@gma...|1414 David Throug...|       34.30555663|13.71751367|    36.7

In [20]:
output.select("Independent Features").show()

+--------------------+
|Independent Features|
+--------------------+
|[34.49726773,12.6...|
|[31.92627203,11.1...|
|[33.00091476,11.3...|
|[34.30555663,13.7...|
|[33.33067252,12.7...|
|[33.87103788,12.0...|
|[32.0215955,11.36...|
|[32.73914294,12.3...|
|[33.9877729,13.38...|
|[31.93654862,11.8...|
|[33.99257277,13.3...|
|[33.87936082,11.5...|
|[29.53242897,10.9...|
|[33.19033404,12.9...|
|[32.38797585,13.1...|
|[30.73772037,12.6...|
|[32.1253869,11.73...|
|[32.33889932,12.0...|
|[32.18781205,14.7...|
|[32.61785606,13.9...|
+--------------------+
only showing top 20 rows



In [21]:
finalized_data=output.select("Independent Features","Yearly Amount Spent")

In [22]:
finalized_data.show()

+--------------------+-------------------+
|Independent Features|Yearly Amount Spent|
+--------------------+-------------------+
|[34.49726773,12.6...|         587.951054|
|[31.92627203,11.1...|        392.2049334|
|[33.00091476,11.3...|        487.5475049|
|[34.30555663,13.7...|         581.852344|
|[33.33067252,12.7...|         599.406092|
|[33.87103788,12.0...|        637.1024479|
|[32.0215955,11.36...|        521.5721748|
|[32.73914294,12.3...|        549.9041461|
|[33.9877729,13.38...|         570.200409|
|[31.93654862,11.8...|        427.1993849|
|[33.99257277,13.3...|        492.6060127|
|[33.87936082,11.5...|        522.3374046|
|[29.53242897,10.9...|        408.6403511|
|[33.19033404,12.9...|        573.4158673|
|[32.38797585,13.1...|        470.4527333|
|[30.73772037,12.6...|        461.7807422|
|[32.1253869,11.73...|        457.8476959|
|[32.33889932,12.0...|        407.7045475|
|[32.18781205,14.7...|        452.3156755|
|[32.61785606,13.9...|        605.0610388|
+----------

In [23]:
train_data,test_data=finalized_data.randomSplit([0.75,0.25])

In [24]:
regressor=LinearRegression(featuresCol='Independent Features', labelCol='Yearly Amount Spent')
regressor=regressor.fit(train_data)

In [25]:
regressor.coefficients

DenseVector([25.916, 38.7533, 0.6489, 61.5527])

In [26]:
regressor.intercept

-1065.3389282952464

In [27]:
pred_results=regressor.evaluate(test_data)

In [28]:
pred_results.predictions.show(40)

+--------------------+-------------------+------------------+
|Independent Features|Yearly Amount Spent|        prediction|
+--------------------+-------------------+------------------+
|[30.4925367,11.56...|        282.4712457| 287.5511288561554|
|[30.83643267,13.1...|        467.5019004| 471.7083721034478|
|[31.38958548,10.9...|        410.0696111|409.97897336753044|
|[31.42522688,13.2...|        530.7667187| 535.1279124968523|
|[31.44597248,12.8...|        484.8769649|482.55283577125215|
|[31.44744649,10.1...|        418.6027421| 426.6904772628857|
|[31.5171218,10.74...|        275.9184207|280.99070388139353|
|[31.52575242,11.3...|        443.9656268| 449.7747349811259|
|[31.60983957,12.7...|        444.5455497| 427.6610748286673|
|[31.62536013,13.1...|        376.3369008| 381.3716098471659|
|[31.65480968,13.0...|        475.2634237| 469.1891464882608|
|[31.66104982,11.3...|        416.3583536|417.51858802846596|
|[31.73663569,10.7...|        496.9334463|494.52048045248057|
|[31.812

# Accessing example files in spark

In [33]:
#Testing pyspark installation
import findspark
findspark.init()
findspark.find()
import pyspark
findspark.find()


'C:\\spark-3.0.0-bin-hadoop2.7\\spark-3.0.0-bin-hadoop2.7'

In [34]:
from pyspark.ml.clustering import LDA

# Loads data.
dataset = spark.read.format("libsvm").load("C:\\spark-3.0.0-bin-hadoop2.7\\spark-3.0.0-bin-hadoop2.7\\data\\mllib\\sample_lda_libsvm_data.txt")

In [35]:
dataset

DataFrame[label: double, features: vector]

In [36]:
# Trains a LDA model.
lda = LDA(k=10, maxIter=10)
model = lda.fit(dataset)

ll = model.logLikelihood(dataset)
lp = model.logPerplexity(dataset)
print("The lower bound on the log likelihood of the entire corpus: " + str(ll))
print("The upper bound on perplexity: " + str(lp))

# Describe topics.
topics = model.describeTopics(3)
print("The topics described by their top-weighted terms:")
topics.show(truncate=False)

# Shows the result
transformed = model.transform(dataset)
transformed.show(truncate=False)

The lower bound on the log likelihood of the entire corpus: -820.7717059783324
The upper bound on perplexity: 3.156814253762817
The topics described by their top-weighted terms:
+-----+-----------+---------------------------------------------------------------+
|topic|termIndices|termWeights                                                    |
+-----+-----------+---------------------------------------------------------------+
|0    |[0, 3, 5]  |[0.0988907776639746, 0.09840811080140031, 0.09815784171623272] |
|1    |[1, 4, 9]  |[0.17372581481263155, 0.1541451633717008, 0.14809304265019235] |
|2    |[1, 2, 0]  |[0.10763518719710427, 0.09290001778648123, 0.09285769471788125]|
|3    |[1, 7, 8]  |[0.10278233443057151, 0.10257897851841583, 0.09964766524551634]|
|4    |[6, 7, 1]  |[0.10991496236314273, 0.10762017498569953, 0.0934346815074767] |
|5    |[8, 2, 6]  |[0.10265802360235843, 0.10194550822222663, 0.09453499013679116]|
|6    |[3, 7, 8]  |[0.1061840707195339, 0.10315547141829495, 0.098