In [3]:
!pip install sqlalchemy scipy

Collecting sqlalchemy
  Using cached SQLAlchemy-2.0.31-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
Installing collected packages: sqlalchemy
Successfully installed sqlalchemy-2.0.31

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m24.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [4]:
import logging
import requests
import pandas as pd
from kafka import KafkaProducer
import json
import psycopg2
from sqlalchemy import create_engine
from scipy.special import boxcox
import numpy as np
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


In [5]:
def zscore_normalization(df, name):
    mean = df[name].mean()
    sd = df[name].std()

    df[name] = (df[name] - mean) / sd

def preprocess(df):
    df = df.drop(columns=['Name', 'md5'])
    for i in df.columns:
        if i != 'legitimate':
            df[i] = boxcox(df[i], 0.5)
            zscore_normalization(df, i)
    correlation_matrix = df.corr()
    cols_to_drop = []
    for i in df.columns:
        for j in df.columns:
            if i != j and i != 'legitimate' and j != 'legitimate' and abs(correlation_matrix[i][j]) > 0.6 and i not in cols_to_drop and j not in cols_to_drop:
                cols_to_drop.append(i)
    cols_to_drop = set(cols_to_drop)
    df.drop(columns=cols_to_drop, inplace=True)
    return df

In [6]:
import pandas as pd
import requests
import json
import logging
from kafka import KafkaProducer
from sqlalchemy import create_engine

# Initialize logger
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Broker address
brokers = ["ec2-54-80-241-235.compute-1.amazonaws.com:9092"]

# GitHub raw URL for the CSV file
github_url = "https://raw.githubusercontent.com/tsimhadri-ews/internproject/malware-detection-0/src/MalwareData.csv"

# PostgreSQL database connection details
db_config = {
    'dbname': 'mydb',
    'user': 'exampleuser',
    'password': 'abc',
    'host': '10.100.107.105',
    'port': 5432
}

# Kafka topics
postgres_topic = "postgresql"
kserve_topic = "three"

# Download the CSV file from GitHub
logger.info("Downloading CSV file from GitHub...")
response = requests.get(github_url)

# Check if the request was successful
if response.status_code == 200:
    logger.info("CSV file downloaded successfully.")
    # Read the data into a pandas DataFrame
    df = pd.read_csv(github_url, sep='|')
    df = preprocess(df)
    df = df.drop(columns=['legitimate'])
    logger.info("CSV file processed successfully.")
else:
    logger.error(f"Failed to download file: {response.status_code}")
    exit()

# Create Kafka producers
logger.info("Creating Kafka producers...")
postgres_producer = KafkaProducer(
    bootstrap_servers=brokers,
    value_serializer=lambda message: json.dumps({k: str(v).lower() for k, v in message.items()}).encode('utf-8'),
)

kserve_producer = KafkaProducer(
    bootstrap_servers=brokers,
    value_serializer=lambda message: json.dumps(message).encode('utf-8'),
)

# PostgreSQL connection setup
logger.info("Connecting to PostgreSQL...")
try:W
    engine = create_engine(f"postgresql://{db_config['user']}:{db_config['password']}@{db_config['host']}:{db_config['port']}/{db_config['dbname']}")
    logger.info("PostgreSQL connection established successfully.")
except Exception as e:
    logger.error(f"Failed to connect to PostgreSQL: {e}")
    exit()

# Send data to KServe and PostgreSQL
first_5_rows = df.head(5)
for index, row in first_5_rows.iterrows():
    data = row.to_dict()
    
    # Send to PostgreSQL topic
    postgres_producer.send(postgres_topic, data)
    logger.info(f"Sent row {index + 1} to PostgreSQL Kafka topic")
    
    # Send to KServe topic
    kserve_producer.send(kserve_topic, data)
    logger.info(f"Sent row {index + 1} to KServe Kafka topic")
    
    # Insert into PostgreSQL
    try:
        row_df = pd.DataFrame([data])
        row_df.to_sql('your_table_name', engine, if_exists='append', index=False)
        logger.info(f"Row {index + 1} inserted into PostgreSQL successfully.")
    except Exception as e:
        logger.error(f"Failed to insert row {index + 1} into PostgreSQL: {e}")

# Flush data and close the producers
postgres_producer.flush()
postgres_producer.close()
kserve_producer.flush()
kserve_producer.close()
logger.info(f"First 5 rows sent to PostgreSQL Kafka topic: {postgres_topic} and KServe Kafka topic: {kserve_topic}")


INFO:root:Downloading CSV file from GitHub...
INFO:root:CSV file downloaded successfully.
INFO:root:CSV file processed successfully.
INFO:root:Creating Kafka producers...
INFO:kafka.conn:<BrokerConnection node_id=bootstrap-0 host=ec2-54-80-241-235.compute-1.amazonaws.com:9092 <connecting> [IPv4 ('54.80.241.235', 9092)]>: connecting to ec2-54-80-241-235.compute-1.amazonaws.com:9092 [('54.80.241.235', 9092) IPv4]
INFO:kafka.conn:Probing node bootstrap-0 broker version
INFO:kafka.conn:<BrokerConnection node_id=bootstrap-0 host=ec2-54-80-241-235.compute-1.amazonaws.com:9092 <connecting> [IPv4 ('54.80.241.235', 9092)]>: Connection complete.
INFO:kafka.conn:Broker version identified as 2.5.0
INFO:kafka.conn:Set configuration api_version=(2, 5, 0) to skip auto check_version requests on startup
INFO:kafka.conn:<BrokerConnection node_id=bootstrap-0 host=ec2-54-80-241-235.compute-1.amazonaws.com:9092 <connecting> [IPv4 ('54.80.241.235', 9092)]>: connecting to ec2-54-80-241-235.compute-1.amazonaw