In [1]:
# findspark.init() locates the local Spark installation and makes PySpark
# importable, so it has to run before `import pyspark` below.
import findspark
findspark.init()
# NOTE(review): find() returns the Spark home path, but the value is discarded
# here because it is not the last expression of the cell.
findspark.find()
import pyspark

In [2]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, max, min, sum, mean
# from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType

In [3]:
# Start (or reuse) the Spark session for this notebook and keep a handle on
# its SparkContext for the RDD examples further down.
session_builder = SparkSession.builder.appName("ScalaToPySpark")
spark = session_builder.getOrCreate()
sc = spark.sparkContext

In [4]:
# Location of the local MySQL server. The first connection targets
# information_schema so that the available databases can be listed.
jdbc_hostname, jdbc_port = "localhost", 3306
jdbc_url = f"jdbc:mysql://{jdbc_hostname}:{jdbc_port}/information_schema"

In [5]:
# MySQL login details come from the environment so that no secret is stored
# in the notebook. Set MYSQL_PASSWORD (and optionally MYSQL_USER, which
# defaults to "root") before starting the kernel; if MYSQL_PASSWORD is unset
# an empty password is sent to the server.
connection_properties = {
    "user": os.environ.get("MYSQL_USER", "root"),
    "password": os.environ.get("MYSQL_PASSWORD", ""),
    "driver": "com.mysql.cj.jdbc.Driver",  # JDBC driver class handed to Spark
}

In [6]:
# Wrap the query in a derived table so that Spark reads only SCHEMA_NAME;
# this avoids the unsupported MySQL column types in SCHEMATA.
schemata_query = "(SELECT SCHEMA_NAME FROM SCHEMATA) AS dbs"
df_databases = spark.read.jdbc(jdbc_url, schemata_query, properties=connection_properties)

In [7]:
# Print the schema names in full; truncate=False keeps long values from being cut off.
df_databases.show(truncate=False)

+------------------+
|SCHEMA_NAME       |
+------------------+
|mysql             |
|information_schema|
|performance_schema|
|sys               |
|sparkdb           |
+------------------+



In [8]:
# Build a second JDBC URL that points at the sparkdb database on the same
# host and port as the information_schema connection.
database = "sparkdb"
jdbc_url_sparkdb = f"jdbc:mysql://{jdbc_hostname}:{jdbc_port}/{database}"


In [9]:
# Load the whole "emp" table from sparkdb into a DataFrame.
emp_table = "emp"
df_emp = spark.read.jdbc(jdbc_url_sparkdb, emp_table, properties=connection_properties)

In [10]:
# Preview the employee rows; show() prints only the first 20 rows by default.
df_emp.show()

+-----+-----------+-----+
|empid|      ename| esal|
+-----+-----------+-----+
|  100|    Abhiran|14000|
|  101|     Bhavya|24000|
|  102|    Aravind|54000|
|  103|   Mohanlal|34000|
|  104|     Rakesh|84000|
|  105|      Danya|58000|
|  106|      Eswar|29000|
|  107|     Faisal|19000|
|  108|     Venkat|74000|
|  109| RajVardhan|87000|
|  110|  SekharRaj|56000|
|  111|    Mounika|39000|
|  112|    Vardhan|64000|
|  113|    Richard|68000|
|  114|      Bruce|96000|
|  115|   Balamani|49000|
|  116|RamChandran|27000|
|  117|   Rajsekar|88000|
|  118|     Ravali|68000|
|  119|      Nasal|50000|
+-----+-----------+-----+
only showing top 20 rows



In [11]:
# Total number of rows in emp (the preview above stops at 20).
df_emp.count()

21

In [15]:
# Export the employee rows as CSV part files, replacing any previous export.
csv_target = "file:///C:/Users/aksha/Pyspark/df_emp_csv"
df_emp.write.csv(csv_target, mode="overwrite")

In [16]:
# Export the employee rows as JSON part files, replacing any previous export.
json_target = "file:///C:/Users/aksha/Pyspark/df_emp_json"
df_emp.write.json(json_target, mode="overwrite")

In [17]:
# Employees earning more than 40000, listed by name in descending order.
high_earners = df_emp.filter(col("esal") > 40000)
name_descending = col("ename").desc()
high_earners.orderBy(name_descending).show()

+-----+----------+-----+
|empid|     ename| esal|
+-----+----------+-----+
|  108|    Venkat|74000|
|  112|   Vardhan|64000|
|  110| SekharRaj|56000|
|  113|   Richard|68000|
|  118|    Ravali|68000|
|  104|    Rakesh|84000|
|  117|  Rajsekar|88000|
|  109|RajVardhan|87000|
|  119|     Nasal|50000|
|  105|     Danya|58000|
|  114|     Bruce|96000|
|  115|  Balamani|49000|
|  102|   Aravind|54000|
+-----+----------+-----+



In [18]:
# Inspect the column names, types and nullability of the DataFrame read from emp.
df_emp.printSchema()

root
 |-- empid: integer (nullable = true)
 |-- ename: string (nullable = true)
 |-- esal: integer (nullable = true)



In [19]:
# Connection settings for the write-back step.
# NOTE: this rebinds jdbc_url, which earlier pointed at information_schema.
jdbc_url = "jdbc:mysql://localhost:3306/sparkdb"  # Target YOUR database
# Credentials come from the environment so that no secret is stored in the
# notebook. Set MYSQL_PASSWORD (and optionally MYSQL_USER, default "root")
# before starting the kernel; if MYSQL_PASSWORD is unset an empty password
# is sent to the server.
connection_properties = {
    "user": os.environ.get("MYSQL_USER", "root"),
    "password": os.environ.get("MYSQL_PASSWORD", ""),
    "driver": "com.mysql.cj.jdbc.Driver",  # JDBC driver class handed to Spark
}

In [20]:
# Write every row of df_emp to the MySQL table "resulttab".
# mode may be "overwrite", "append", "ignore" or "error"; "overwrite"
# replaces the table contents if the table already exists.
target_table = "resulttab"
df_emp.write.jdbc(jdbc_url, target_table, mode="overwrite", properties=connection_properties)

In [21]:
# List the databases in Spark's own catalog (this is not the MySQL server).
# NOTE(review): presumably only `default` is listed because Hive is not configured yet.
spark.sql("show databases").show()

+---------+
|namespace|
+---------+
|  default|
+---------+



In [None]:
# spark.sql("select count(*) from batch128.parttable").show()  # works once Spark is connected to Hive and can access its databases and tables

In [None]:
# spark.sql("""create table testtab(cid int, cnam string, cheadcnt int, cloc string)
#     row format delimited fields terminated by '|' lines terminated by '\n'
#     stored as textfile location 'hdfs://localhost:8020/user/hive/warehouse/testtab'""").show()

In [None]:
# spark.sql("load data local inpath 'file:///............../empdata.log' into table <table_name>").show()

In [None]:
# Distribute the integers 1..30 as an RDD (range's upper bound is exclusive).
data = sc.parallelize(range(1,31))

In [None]:
# Pair every number with its square: n -> (n, n * n).
mapdata = data.map(lambda n: (n, n * n))

In [None]:
# Turn the RDD of (number, square) tuples into a DataFrame with named columns.
df = mapdata.toDF(["number","Square"])

In [None]:
# Preview the number/square DataFrame (show() prints the first 20 rows by default).
df.show()

In [None]:
# df.write.saveAsTable("Hivetab20") #Directly saves the table in the hive

In [None]:
#Hive 
#show databases
#show tables in batch128  here batch128 is a database name

In [None]:
#need to configure hive in spark environment so that we can access the hive environment and its data

In [None]:
#spark.sql("show tables in batch128").show() this is the query in pyspark

In [None]:
# Hive integration with Spark SQL
# Step 1: Copy Hive's metastore configuration file (hive-site.xml) to the Spark conf dir
# Step 2: Copy the Hadoop master node information (storage metadata) to the Spark conf dir - core-site.xml
# Step 3: Copy the Hadoop backup mechanism information to the Spark conf dir - hdfs-site.xml

In [None]:
#spark.sql("select count(*) from batch128.tablename").show()