# Producer for Stream Processing

## Installations and Imports

In [1]:
from scipy.special import boxcox
import requests
from kafka import KafkaProducer
import pandas as pd
import json
import time
import logging


## Helper Functions for Preprocessing

In [2]:
def zscore_normalization(df, name):
    """Standardize column ``name`` of ``df`` in place to z-scores.

    Each value is replaced by ``(value - mean) / std``, using the column's
    own ``mean()`` and ``std()``.

    When the standard deviation is zero or NaN (a constant column, or a
    column with fewer than two values) the column is only mean-centred, so
    it does not fill up with NaN/inf from dividing by zero.

    Args:
        df: DataFrame holding the column; it is modified in place.
        name: Label of the numeric column to standardize.

    Returns:
        None. The result is written back to ``df[name]``.
    """
    column = df[name]
    centered = column - column.mean()
    sd = column.std()

    # `not sd > 0` is true for both 0 and NaN, the two cases where dividing
    # would destroy the column.
    if not sd > 0:
        df[name] = centered
        return

    df[name] = centered / sd

def preprocess(df):
    """Transform the raw data into a normalized, de-correlated feature frame.

    Steps, in order:
      1. Drop the 'Name' and 'md5' columns.
      2. For every column except 'legitimate', apply a Box-Cox transform
         with lambda 0.5 and then z-score normalize it.
      3. Greedily drop feature columns: walking the columns in order, a
         column is dropped when its absolute correlation with any other
         feature column that has not itself been dropped exceeds 0.6.

    Args:
        df: DataFrame containing 'Name', 'md5', 'legitimate' and the
            feature columns.

    Returns:
        A new DataFrame with the transformed, retained columns (the
        'legitimate' column is kept untouched).
    """
    df = df.drop(columns=['Name', 'md5'])

    feature_cols = [col for col in df.columns if col != 'legitimate']
    for col in feature_cols:
        df[col] = boxcox(df[col], 0.5)
        zscore_normalization(df, col)

    corr = df.corr()
    redundant = []
    for candidate in feature_cols:
        for other in feature_cols:
            # Never compare a column with itself or with one already dropped.
            if other == candidate or other in redundant:
                continue
            if abs(corr[candidate][other]) > 0.6:
                redundant.append(candidate)
                # Once dropped, the candidate cannot match any further pair.
                break

    return df.drop(columns=redundant)

## Producer

### In this section the producer is created against the configured broker; the data is downloaded, preprocessed, serialized to JSON, and sent to the Kafka topic for the consumer to read.

In [3]:

from io import BytesIO

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Broker address
brokers = ["*:9092"]

# GitHub raw URL for the CSV file
github_url = "https://raw.githubusercontent.com/tsimhadri-ews/internproject/malware-detection-0/src/MalwareData.csv"


# Download the CSV file from GitHub. The timeout keeps the cell from hanging
# forever on a stalled connection; network errors are logged before exiting.
try:
    response = requests.get(github_url, timeout=30)
except requests.RequestException as exc:
    logger.error(f"Failed to download file: {exc}")
    raise SystemExit(1) from exc

# Check if the request was successful
if response.status_code == 200:
    # Parse the bytes that were already downloaded instead of handing the URL
    # to pandas, which would fetch the whole file a second time.
    df = pd.read_csv(BytesIO(response.content), sep='|')
    df = preprocess(df)
    # The label column is not part of the streamed payload
    df = df.drop(columns=['legitimate'])
    # Only the first 5 preprocessed rows are streamed
    df = df.head(5)
    logger.info("CSV file successfully downloaded and processed")
else:
    logger.error(f"Failed to download file: {response.status_code}")
    # Exit with a non-zero status so the failure is visible to the caller
    raise SystemExit(1)

# Replace with your topic name
topic = "testTopic"

# Create a Kafka producer; each message is JSON-encoded as UTF-8 bytes
producer = KafkaProducer(
    bootstrap_servers=brokers,
    value_serializer=lambda message: json.dumps(message).encode('utf-8'),
)
logger.info("Kafka producer created successfully")

# Function to send data to Kafka in batches
def stream_data_to_kafka(dataframe, batch_size, sleep_time):
    """Send the rows of ``dataframe`` to the Kafka topic in batches.

    Each row is converted to a dict and sent with the module-level
    ``producer`` to the module-level ``topic``. After every batch the
    producer is flushed and the function sleeps for ``sleep_time`` seconds.
    The producer is always closed before returning, even if sending fails.

    Args:
        dataframe: DataFrame whose rows are streamed. The per-row log line
            computes ``index + 1``, so index labels must be numeric.
        batch_size: Number of rows per batch; must be positive.
        sleep_time: Seconds to pause after each batch.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    total_rows = len(dataframe)
    # Ceiling division: a final partial batch still counts as a batch
    num_batches = -(-total_rows // batch_size)
    logger.info(f"Streaming data in {num_batches} batches of {batch_size} rows each")

    try:
        for i in range(num_batches):
            start_index = i * batch_size
            end_index = min(start_index + batch_size, total_rows)

            batch = dataframe.iloc[start_index:end_index]

            for index, row in batch.iterrows():
                producer.send(topic, row.to_dict())
                logger.info(f"Sent row {index + 1} to Kafka")

            # Flush data to Kafka
            producer.flush()

            # Announce the pause before it happens, then sleep
            logger.info(f"Batch {i + 1}/{num_batches} sent. Sleeping for {sleep_time} seconds.")
            time.sleep(sleep_time)
    finally:
        # Close the producer even when a send fails so the connection is released
        producer.close()
        logger.info("Kafka producer closed")

# Stream the data in batches of 3 rows with a 5-second pause after each batch
batch_size = 3
sleep_time = 5  # seconds

# Sends every row of df to the topic; the producer is closed once all batches are sent
stream_data_to_kafka(df, batch_size, sleep_time)

logger.info(f"Data streamed to Kafka topic: {topic} in batches of {batch_size}")


INFO:__main__:CSV file successfully downloaded and processed
INFO:kafka.conn:<BrokerConnection node_id=bootstrap-0 host=ec2-184-73-88-208.compute-1.amazonaws.com:9092 <connecting> [IPv4 ('184.73.88.208', 9092)]>: connecting to ec2-184-73-88-208.compute-1.amazonaws.com:9092 [('184.73.88.208', 9092) IPv4]
INFO:kafka.conn:Probing node bootstrap-0 broker version
INFO:kafka.conn:<BrokerConnection node_id=bootstrap-0 host=ec2-184-73-88-208.compute-1.amazonaws.com:9092 <connecting> [IPv4 ('184.73.88.208', 9092)]>: Connection complete.
INFO:kafka.conn:Broker version identified as 2.5.0
INFO:kafka.conn:Set configuration api_version=(2, 5, 0) to skip auto check_version requests on startup
INFO:__main__:Kafka producer created successfully
INFO:__main__:Streaming data in 2 batches of 3 rows each
INFO:__main__:Sent row 1 to Kafka
INFO:__main__:Sent row 2 to Kafka
INFO:__main__:Sent row 3 to Kafka
INFO:kafka.conn:<BrokerConnection node_id=1 host=ec2-184-73-88-208.compute-1.amazonaws.com:9093 <connec