### Analyse search terms on the e-commerce web server


In [None]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425344 sha256=d53f46e4524b40b995f440d22dec80a8eaf91705afbd31d7e930b18818686ee9
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [None]:
# Install spark
import pyspark
import pandas as pd


In [None]:
!pip install findspark

Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1


In [None]:
import findspark
findspark.init()

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [None]:
# Start session\
sc = SparkContext()

# Creating a spark session
spark = SparkSession \
    .builder \
    .appName("SparkML Ops").getOrCreate()

In [None]:
spark

In [None]:
# Download The search term dataset from the below url
# https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/searchterms.csv

In [None]:
import requests


In [None]:
response = requests.get("https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/searchterms.csv")

In [None]:
open("searchterms.csv", "wb").write(response.content)

233457

In [None]:
# Load the csv into a spark dataframe
searchterms = pd.read_csv('searchterms.csv')
sdf = spark.createDataFrame(searchterms)

In [None]:
# Print the number of rows and columns
print("Number of rows in the CSV file: ",len(searchterms))
print("Number of columns in the CSV file:",len(searchterms.columns))


Number of rows in the CSV file:  10000
Number of columns in the CSV file: 4


In [None]:
# Print the top 5 rows
searchterms.head()

Unnamed: 0,day,month,year,searchterm
0,12,11,2021,mobile 6 inch
1,12,11,2021,mobile latest
2,12,11,2021,tablet wifi
3,12,11,2021,laptop 14 inch
4,12,11,2021,mobile 5g


In [None]:
# Find out the datatype of the column searchterm?
searchterms.dtypes

day            int64
month          int64
year           int64
searchterm    object
dtype: object

In [None]:
sdf.printSchema()

root
 |-- day: long (nullable = true)
 |-- month: long (nullable = true)
 |-- year: long (nullable = true)
 |-- searchterm: string (nullable = true)



In [None]:
# How many times was the term `gaming laptop` searched?
sdf.createOrReplaceTempView("sdf")
spark.sql("""select count(*) as gaming_laptop from sdf where searchterm='gaming laptop'""").show()

+-------------+
|gaming_laptop|
+-------------+
|          499|
+-------------+



In [None]:
# Print the top 5 most frequently used search terms?
spark.sql("""select count(*), searchterm as gaming_laptop from sdf group by searchterm order by count(*) desc""").show(5)

+--------+-------------+
|count(1)|gaming_laptop|
+--------+-------------+
|    2312|mobile 6 inch|
|    2301|    mobile 5g|
|    1327|mobile latest|
|     935|       laptop|
|     896|  tablet wifi|
+--------+-------------+
only showing top 5 rows



In [None]:
# The pretrained sales forecasting model is available at  the below url
# https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/model.tar.gz

In [None]:
# Load the sales forecast model.
!wget --no-check-certificate https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/model.gzip
!tar -xvzf model.gzip

--2023-10-22 15:31:59--  https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/model.gzip
Resolving cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)... 169.45.118.108
Connecting to cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)|169.45.118.108|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1722 (1.7K) [application/gzip]
Saving to: ‘model.gzip’


2023-10-22 15:32:00 (667 MB/s) - ‘model.gzip’ saved [1722/1722]

sales_prediction.model/
sales_prediction.model/metadata/
sales_prediction.model/metadata/part-00000
sales_prediction.model/metadata/.part-00000.crc
sales_prediction.model/metadata/_SUCCESS
sales_prediction.model/metadata/._SUCCESS.crc
sales_prediction.model/data/
sales_prediction.model/data/part-00000-f37d8b09-cd1a-426c-ba90-4047208b011b-c000.snappy.par

In [None]:
# Using the sales forecast model, predict the sales for the year of 2023.

In [30]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegressionModel

In [31]:

model = LinearRegressionModel.load('sales_prediction.model')

In [32]:
def predict(year):
    assembler = VectorAssembler(inputCols=["year"],outputCol="features")
    data = [[year,0]]
    columns = ["year", "sales"]
    _ = spark.createDataFrame(data, columns)
    __ = assembler.transform(_).select('features', 'year')
    predictions = model.transform(__)
    predictions.select('prediction').show()

predict(2023)

+------------------+
|        prediction|
+------------------+
|176.14285712605306|
+------------------+

