### Required Imports 

In [1]:
from kafka import KafkaConsumer
import json
import pymongo as mdb
from pymongo import MongoClient
import pandas as pd

### Define Kafka Broker and Topic

In [2]:
bootstrap_servers = 'kafka:9092'
topic_name = 'nba-WEB-data'

### Creation of MongoDB-Client and the connection with Server

In [None]:
client = MongoClient("mongodb://pt-n20.p4001.w3.cs.technikum-wien.at:4001")

### Selection of MongoDB-Database and -Collection

In [3]:
mdb = client.nba_data
collection = mdb.season_stats_web

### Create and configure Kafka-Consumer

In [4]:
consumer = KafkaConsumer(
    topic_name,
    bootstrap_servers=bootstrap_servers,
    key_deserializer=lambda x: x.decode('utf-8'),
    value_deserializer=lambda x: json.loads(x.decode('utf-8'))
)

### Empty Array to save the messages

In [3]:
message_list = []

### Fetch messages from the Kafka consumer and add them to the array

In [8]:
for message in consumer:
    data = message.value
    message_list.append(data)
print("Exited")

Exited


### Close Kafka-Consumer

In [7]:
consumer.close()

### Creation of a Pandas DataFrame

In [9]:
pandas_df=pd.DataFrame(message_list)

### Shape of DataFrames

In [10]:
pandas_df.shape

(3676, 30)

### The first lines of DataFrames are shown

In [11]:
pandas_df.head()

Unnamed: 0,season,PLAYER_NAME,TEAM_ABBREVIATION,AGE,GP,W,L,MIN,PTS,FGM,...,REB,AST,TOV,STL,BLK,PF,NBA_FANTASY_PTS,DD2,TD3,PLUS_MINUS
0,1996,Michael Jordan,CHI,34,82,69,13,37.9,29.6,11.2,...,5.9,4.3,2.0,1.7,0.5,1.9,47.8,9.0,1.0,10.0
1,1996,Karl Malone,UTA,33,82,64,18,36.6,27.4,10.5,...,9.9,4.5,2.8,1.4,0.6,2.6,49.0,43.0,1.0,9.4
2,1996,Glen Rice,CHH,30,79,52,27,42.5,26.8,9.1,...,4.0,2.0,2.2,0.9,0.3,2.4,36.1,0.0,0.0,2.2
3,1996,Shaquille O'Neal,LAL,25,51,38,13,38.1,26.2,10.8,...,12.5,3.1,2.9,0.9,2.9,3.5,54.4,44.0,0.0,5.2
4,1996,Mitch Richmond,SAC,32,81,34,47,38.6,25.9,8.9,...,3.9,4.2,2.9,1.5,0.3,2.5,39.2,4.0,1.0,-1.9


## Data Cleaning

### Required Imports for Spark

In [12]:
from pyspark.sql import SparkSession


### Spark Session

In [13]:
spark = SparkSession.builder \
    .appName("DataCleaningSpark") \
    .getOrCreate()

### Creation of Spark DataFrame

In [14]:
spark_df = spark.createDataFrame(pandas_df)


  for column, series in pdf.iteritems():
  for column, series in pdf.iteritems():


### Datatype-Schema of for each Colum in Spark-DF

In [15]:
spark_df.printSchema()

root
 |-- season: string (nullable = true)
 |-- PLAYER_NAME: string (nullable = true)
 |-- TEAM_ABBREVIATION: string (nullable = true)
 |-- AGE: string (nullable = true)
 |-- GP: string (nullable = true)
 |-- W: string (nullable = true)
 |-- L: string (nullable = true)
 |-- MIN: string (nullable = true)
 |-- PTS: string (nullable = true)
 |-- FGM: string (nullable = true)
 |-- FGA: string (nullable = true)
 |-- FG_PCT: string (nullable = true)
 |-- FG3M: string (nullable = true)
 |-- FG3A: string (nullable = true)
 |-- FG3_PCT: string (nullable = true)
 |-- FTM: string (nullable = true)
 |-- FTA: string (nullable = true)
 |-- FT_PCT: string (nullable = true)
 |-- OREB: string (nullable = true)
 |-- DREB: string (nullable = true)
 |-- REB: string (nullable = true)
 |-- AST: string (nullable = true)
 |-- TOV: string (nullable = true)
 |-- STL: string (nullable = true)
 |-- BLK: string (nullable = true)
 |-- PF: string (nullable = true)
 |-- NBA_FANTASY_PTS: string (nullable = true)
 |-- 

### Select data and define data types

In [16]:
cleaned_data = spark_df.select(
    spark_df['SEASON'].cast('int'),
    spark_df['PLAYER_NAME'],
    spark_df['AGE'].cast('int'),
    spark_df['W'].cast('int'),
    spark_df['L'].cast('int'),
    spark_df['DD2'].cast('double'),
    spark_df['TD3'].cast('double'),
    spark_df['PLUS_MINUS'].cast('double')
)

### Convert data into a dictionary format

In [17]:
records = cleaned_data.toPandas().to_dict('records')

### Iteration through the dataset and insert it into the MongoDB collection

In [18]:
for record in records:
    existing_doc = collection.find_one({
        'SEASON': record['SEASON'],
        'PLAYER_NAME': record['PLAYER_NAME'],
    })

    if existing_doc is None:
        collection.insert_one(record)
        print(f"Inserted {record['PLAYER_NAME']} - Season: {record['SEASON']}")
    else:
        print("Skipped")

Inserted Michael Jordan - Season: 1996
Inserted Karl Malone - Season: 1996
Inserted Glen Rice - Season: 1996
Inserted Shaquille O'Neal - Season: 1996
Inserted Mitch Richmond - Season: 1996
Inserted Latrell Sprewell - Season: 1996
Inserted Allen Iverson - Season: 1996
Inserted Hakeem Olajuwon - Season: 1996
Inserted Patrick Ewing - Season: 1996
Inserted LaPhonso Ellis - Season: 1996
Inserted Kendall Gill - Season: 1996
Inserted Gary Payton - Season: 1996
Inserted Reggie Miller - Season: 1996
Inserted Grant Hill - Season: 1996
Inserted Glenn Robinson - Season: 1996
Inserted Vin Baker - Season: 1996
Inserted Jerry Stackhouse - Season: 1996
Inserted Tom Gugliotta - Season: 1996
Inserted Anfernee Hardaway - Season: 1996
Inserted Tim Hardaway - Season: 1996
Inserted Scottie Pippen - Season: 1996
Inserted Damon Stoudamire - Season: 1996
Inserted Kevin Johnson - Season: 1996
Inserted Chris Webber - Season: 1996
Inserted Steven Smith - Season: 1996
Inserted Alonzo Mourning - Season: 1996
Insert