# Assignment 3 - (Hadoop)

1. Write a Python program to read a Hadoop configuration file and display the core components of Hadoop.

In [None]:
import configparser

import xml.etree.ElementTree as ET

# Hadoop keeps its core settings in core-site.xml, an XML <configuration>
# document made of <property><name>/<value> pairs. hadoop-env.sh is a shell
# script and cannot be parsed by configparser, so the XML file is read instead.
CORE_SITE_PATH = '/path/to/hadoop/conf/core-site.xml'


def read_hadoop_properties(path):
    """Return the ``name -> value`` properties of a Hadoop ``*-site.xml`` file.

    Args:
        path: Location of the XML configuration file.

    Returns:
        A dict mapping each ``<name>`` to its stripped ``<value>`` text. A
        property without a value maps to an empty string.

    Raises:
        OSError: If the file cannot be opened.
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
    """
    root = ET.parse(path).getroot()
    properties = {}
    for prop in root.iter('property'):
        name = prop.findtext('name')
        if name is None:
            # A <property> without a <name> cannot be looked up; skip it.
            continue
        properties[name.strip()] = (prop.findtext('value') or '').strip()
    return properties


# Read the Hadoop configuration file
try:
    config = read_hadoop_properties(CORE_SITE_PATH)
except (OSError, ET.ParseError) as exc:
    print(f"Could not read Hadoop configuration: {exc}")
    config = {}

# Retrieve the core components from the configuration file
core_components = config.get('fs.defaultFS')

# Display the core components
print("Core Components of Hadoop:")
print(core_components if core_components is not None else "fs.defaultFS is not set")

2. Implement a Python function that calculates the total file size in a Hadoop Distributed File System (HDFS) directory.


In [None]:
from hdfs import InsecureClient

def calculate_total_size(hdfs_url, directory):
    """Return the combined size in bytes of the entries listed in *directory*.

    Args:
        hdfs_url: Base URL handed to ``InsecureClient``.
        directory: HDFS directory whose entries are summed.

    Returns:
        The sum of the ``length`` field of every entry returned by
        ``client.list``. Only that single listing is used; nothing is
        recursed into.
    """
    # Create an HDFS client
    client = InsecureClient(hdfs_url)

    # With status=True, list() yields (name, status_dict) tuples rather than
    # bare dicts, so each item is unpacked before 'length' is read.
    entries = client.list(directory, status=True)

    return sum(status['length'] for _name, status in entries)


# Example usage:
# Both values are placeholders to be replaced with real cluster settings.
# NOTE(review): 9870 is presumably the NameNode HTTP port - confirm for your cluster.
hdfs_url = 'http://localhost:9870'
directory = '/path/to/hdfs/directory'

# Sum the sizes for the directory and report the result in bytes.
total_size = calculate_total_size(hdfs_url, directory)
print(f"Total file size in HDFS directory: {total_size} bytes")


3. Create a Python program that extracts and displays the top N most frequent words from a large text file using the MapReduce approach.


In [3]:
from collections import Counter

# Specify the path to the input file
input_file = 'file.txt'

# Specify the value of N for the top N words
top_n = 10


def map_words(line):
    """Map step: yield a ``(word, 1)`` pair for each whitespace-separated word.

    Words are lower-cased so that counting is case-insensitive.
    """
    for word in line.lower().split():
        yield word, 1


def reduce_counts(pairs):
    """Reduce step: sum the counts of all ``(word, count)`` pairs per word."""
    totals = Counter()
    for word, count in pairs:
        totals[word] += count
    return totals


def count_words(lines):
    """Return a ``Counter`` of word frequencies for an iterable of text lines.

    Lines are consumed one at a time, so a large file never has to be held
    in memory as a whole.
    """
    return reduce_counts(pair for line in lines for pair in map_words(line))


# Stream the input file through the map and reduce steps
try:
    with open(input_file, 'r', encoding='utf-8') as file:
        word_counts = count_words(file)
except FileNotFoundError:
    print(f"Input file not found: {input_file}")
    word_counts = Counter()

# Get the top N most frequent words
top_words = word_counts.most_common(top_n)

# Display the top N words and their frequencies
print("Top N Most Frequent Words:")
for word, count in top_words:
    print(f"{word}: {count}")


Top N Most Frequent Words:
how: 2
are: 2
you: 2
hello: 1
doing: 1


4. Write a Python script that checks the health status of the NameNode and DataNodes in a Hadoop cluster using Hadoop's REST API.


In [None]:
import requests

# Specify the URL of the Hadoop NameNode's web UI
# The <...> parts are placeholders and must be replaced before running.
namenode_url = 'http://<namenode-host>:<namenode-port>/dfshealth.html'

# Specify the URL of the Hadoop DataNode's web UI
# NOTE(review): both URLs point at HTML pages rather than a REST/JMX endpoint,
# and 'databrowser.html' may not exist on a DataNode - confirm against the cluster.
datanode_url = 'http://<datanode-host>:<datanode-port>/databrowser.html'

def check_namenode_health():
    """Print whether the NameNode answers ``namenode_url`` with HTTP 200.

    A connection failure or timeout is reported as "not healthy" rather than
    escaping as an exception, since an unreachable node is exactly the case
    this check exists to detect.
    """
    try:
        # Bound the wait so a hung node cannot block the check forever.
        response = requests.get(namenode_url, timeout=10)
    except requests.RequestException as exc:
        print("NameNode is not healthy")
        print(f"Reason: {exc}")
        return

    if response.status_code == 200:
        print("NameNode is healthy")
    else:
        print("NameNode is not healthy")

def check_datanode_health():
    """Print whether the DataNode answers ``datanode_url`` with HTTP 200.

    A connection failure or timeout is reported as "not healthy" rather than
    escaping as an exception, since an unreachable node is exactly the case
    this check exists to detect.
    """
    try:
        # Bound the wait so a hung node cannot block the check forever.
        response = requests.get(datanode_url, timeout=10)
    except requests.RequestException as exc:
        print("DataNode is not healthy")
        print(f"Reason: {exc}")
        return

    if response.status_code == 200:
        print("DataNode is healthy")
    else:
        print("DataNode is not healthy")

# Example usage:
# Each call issues one HTTP GET and prints a one-line verdict.
check_namenode_health()
check_datanode_health()


5. Develop a Python program that lists all the files and directories in a specific HDFS path.


In [None]:
from hdfs import InsecureClient

def list_hdfs_path(hdfs_url, hdfs_path):
    """Print the full path of every file and directory directly under *hdfs_path*.

    Args:
        hdfs_url: Base URL handed to ``InsecureClient``.
        hdfs_path: HDFS directory to list.
    """
    import posixpath

    client = InsecureClient(hdfs_url)
    # list() returns entry names relative to hdfs_path; join them with the
    # parent so the printed value is a complete HDFS path.
    for name in client.list(hdfs_path):
        print(posixpath.join(hdfs_path, name))

# Example usage:
# Both values are placeholders to be replaced with real cluster settings.
hdfs_url = 'http://localhost:9870'
hdfs_path = '/path/to/hdfs/directory'

# Print one line per entry found under hdfs_path.
list_hdfs_path(hdfs_url, hdfs_path)


6. Implement a Python program that analyzes the storage utilization of DataNodes in a Hadoop cluster and identifies the nodes with the highest and lowest storage capacities.


In [None]:
import requests

# Specify the URL of the Hadoop Namenode's web UI
# Base URL only (no path); the <...> parts are placeholders to replace.
namenode_url = 'http://<namenode-host>:<namenode-port>'

def get_datanodes_storage_utilization():
    """Return the configured capacity of every live DataNode.

    The data comes from the NameNode's JMX servlet (``/jmx``), bean
    ``Hadoop:service=NameNode,name=NameNodeInfo``. Its ``LiveNodes``
    attribute is a JSON-encoded string that maps each DataNode to a dict of
    statistics.
    NOTE(review): the ``capacity`` field is assumed to be in bytes - confirm
    against the target Hadoop version.

    Returns:
        A dict mapping DataNode name to its ``capacity`` value, or ``None``
        when the NameNode cannot be reached or returns no usable data.
    """
    import json

    try:
        response = requests.get(
            f"{namenode_url}/jmx",
            params={'qry': 'Hadoop:service=NameNode,name=NameNodeInfo'},
            timeout=10,
        )
    except requests.RequestException:
        return None

    if response.status_code != 200:
        return None

    try:
        beans = response.json().get('beans') or []
        if not beans:
            return None
        # LiveNodes is itself a JSON document stored as a string.
        live_nodes = json.loads(beans[0].get('LiveNodes') or '{}')
    except ValueError:
        return None

    return {
        node: info['capacity']
        for node, info in live_nodes.items()
        if 'capacity' in info
    }

def analyze_storage_utilization():
    """Print the nodes with the largest and smallest storage capacity.

    The node-to-capacity mapping is obtained from
    ``get_datanodes_storage_utilization``. When that mapping is empty or
    missing, a failure message is printed instead.
    """
    capacities = get_datanodes_storage_utilization()

    # Guard clause: nothing to compare without data.
    if not capacities:
        print("Failed to analyze storage utilization")
        return

    # Pick the keys whose mapped value is the maximum and the minimum.
    top_node = max(capacities, key=capacities.get)
    bottom_node = min(capacities, key=capacities.get)

    print("Node with the highest storage capacity:")
    print("Node:", top_node)
    print("Capacity:", capacities[top_node])

    print("Node with the lowest storage capacity:")
    print("Node:", bottom_node)
    print("Capacity:", capacities[bottom_node])

# Example usage:
# Prints either the highest/lowest capacity nodes or a failure message.
analyze_storage_utilization()


7. Create a Python script that interacts with YARN's ResourceManager API to submit a Hadoop job, monitor its progress, and retrieve the final output.

In [None]:
import requests
import time

# Specify the URL of the YARN ResourceManager's web UI
# Base URL only; the functions below append their own paths to it.
# The <...> parts are placeholders to replace.
resourcemanager_url = 'http://<resourcemanager-host>:<resourcemanager-port>'

def submit_hadoop_job(job_file):
    """Submit *job_file* to YARN and return the new application ID.

    Two requests are made against ``resourcemanager_url``: the first obtains
    a fresh application ID, the second submits the application under it.

    Args:
        job_file: Path of the job JAR placed in the submission payload.

    Returns:
        The application ID string on success, otherwise ``None``. Failures,
        including network errors and malformed responses, are reported on
        stdout and never raised.
    """
    headers = {'Content-Type': 'application/json'}
    # NOTE(review): the ResourceManager REST documentation describes
    # 'local-resources' and 'environment' as {'entry': [...]} lists - verify
    # this payload shape against the target cluster.
    data = {
        'application-id': '',
        'application-name': 'MyHadoopJob',
        'am-container-spec': {
            'local-resources': {
                'file': {
                    'resource': job_file,
                    'type': 'FILE',
                    'visibility': 'APPLICATION',
                }
            },
            'commands': {
                'command': 'hadoop jar ' + job_file,
            },
            'environment': {},
        },
        'application-type': 'MAPREDUCE',
        'resource': {
            'memory': '1024',
            'vCores': '1',
        },
    }

    try:
        # Step 1: reserve an application ID.
        response = requests.post(
            f"{resourcemanager_url}/ws/v1/cluster/apps/new-application",
            headers=headers,
            timeout=30,
        )
        if response.status_code == 200:
            application_id = response.json()['application-id']
            data['application-id'] = application_id

            # Step 2: submit the application; HTTP 202 means it was accepted.
            response = requests.post(
                f"{resourcemanager_url}/ws/v1/cluster/apps",
                headers=headers,
                json=data,
                timeout=30,
            )
            if response.status_code == 202:
                print("Hadoop job submitted successfully")
                return application_id

        print(f"ResourceManager answered HTTP {response.status_code}: {response.text}")
    except (requests.RequestException, ValueError, KeyError) as exc:
        # Connection problems, non-JSON bodies and missing fields all end here.
        print(f"Job submission error: {exc!r}")

    print("Failed to submit Hadoop job")
    return None

def monitor_job_progress(application_id, poll_interval=5):
    """Poll the ResourceManager until the application reaches a terminal state.

    Every observed state is printed, including the terminal one.

    Args:
        application_id: ID of the application to watch.
        poll_interval: Seconds to wait between polls.

    Returns:
        The terminal state (``'FINISHED'``, ``'FAILED'`` or ``'KILLED'``), or
        ``None`` if the status could not be retrieved.
    """
    headers = {'Content-Type': 'application/json'}
    while True:
        try:
            response = requests.get(
                f"{resourcemanager_url}/ws/v1/cluster/apps/{application_id}",
                headers=headers,
                timeout=30,
            )
            if response.status_code != 200:
                print("Failed to get job status")
                return None
            status = response.json()['app']['state']
        except (requests.RequestException, ValueError, KeyError) as exc:
            print("Failed to get job status")
            print(f"Reason: {exc!r}")
            return None

        # Print before testing for completion so the final state is shown too.
        print("Job Status:", status)
        if status in ('FINISHED', 'FAILED', 'KILLED'):
            return status
        time.sleep(poll_interval)

def retrieve_job_output(application_id, counter_name='FILE_BYTES_READ'):
    """Print the total value of one MapReduce job counter.

    Args:
        application_id: YARN application ID (``application_<ts>_<seq>``).
        counter_name: Name of the counter to report.
            NOTE(review): the default is kept from the original code, but a
            "bytes read" counter is probably not the job's output size -
            consider a "bytes written" counter.

    Outcomes are printed, never raised. Nothing is returned.

    NOTE(review): the ``/proxy/<app>`` route reaches the MapReduce
    ApplicationMaster, which is presumably gone once the job finishes; the
    Job History Server may be required then - confirm.
    """
    headers = {'Content-Type': 'application/json'}
    # The MapReduce REST API addresses jobs as job_<ts>_<seq>, which shares
    # the numeric suffix of the YARN application ID.
    job_id = application_id.replace('application_', 'job_', 1)
    job_url = f"{resourcemanager_url}/proxy/{application_id}/ws/v1/mapreduce/jobs/{job_id}"

    try:
        response = requests.get(f"{job_url}/jobattempts", headers=headers, timeout=30)
        if response.status_code != 200:
            print("Failed to retrieve job attempts")
            return

        # The attempts list is wrapped as {'jobAttempts': {'jobAttempt': [...]}}.
        # A bare list is tolerated as well.
        job_attempts = response.json().get('jobAttempts') or {}
        if isinstance(job_attempts, dict):
            job_attempts = job_attempts.get('jobAttempt') or []
        if not job_attempts:
            print("No job attempts found")
            return

        # Counters are exposed per job, not per job attempt.
        response = requests.get(f"{job_url}/counters", headers=headers, timeout=30)
        if response.status_code != 200:
            print("Failed to retrieve job counters")
            return
        counter_groups = response.json()['jobCounters'].get('counterGroup') or []
    except (requests.RequestException, ValueError, KeyError) as exc:
        print(f"Failed to retrieve job output: {exc!r}")
        return

    # Search every group, because the wanted counter need not be in the first.
    for group in counter_groups:
        for counter in group.get('counter') or []:
            if counter.get('name') == counter_name:
                print("Final Output Size:", counter['totalCounterValue'])
                return

# Example usage:
# Placeholder path; replace with the real job JAR.
job_file = '/path/to/hadoop/job.jar'

# Monitoring and output retrieval only run when submission returned an ID
# (submit_hadoop_job returns None on failure).
application_id = submit_hadoop_job(job_file)
if application_id:
    monitor_job_progress(application_id)
    retrieve_job_output(application_id)


8. Create a Python script that interacts with YARN's ResourceManager API to submit a Hadoop job, set resource requirements, and track resource usage during job execution.


In [None]:
import requests
import time

# Specify the URL of the YARN ResourceManager's web UI
# Base URL only; the functions below append their own paths to it.
# The <...> parts are placeholders to replace.
resourcemanager_url = 'http://<resourcemanager-host>:<resourcemanager-port>'

def submit_hadoop_job(job_file, memory_mb, vcores):
    """Submit *job_file* to YARN with explicit resource requirements.

    Two requests are made against ``resourcemanager_url``: the first obtains
    a fresh application ID, the second submits the application under it.

    Args:
        job_file: Path of the job JAR placed in the submission payload.
        memory_mb: Memory requested for the application; sent as a string.
        vcores: Virtual cores requested for the application; sent as a string.

    Returns:
        The application ID string on success, otherwise ``None``. Failures,
        including network errors and malformed responses, are reported on
        stdout and never raised.
    """
    headers = {'Content-Type': 'application/json'}
    # NOTE(review): the ResourceManager REST documentation describes
    # 'local-resources' and 'environment' as {'entry': [...]} lists - verify
    # this payload shape against the target cluster.
    data = {
        'application-id': '',
        'application-name': 'MyHadoopJob',
        'am-container-spec': {
            'local-resources': {
                'file': {
                    'resource': job_file,
                    'type': 'FILE',
                    'visibility': 'APPLICATION',
                }
            },
            'commands': {
                'command': 'hadoop jar ' + job_file,
            },
            'environment': {},
        },
        'application-type': 'MAPREDUCE',
        'resource': {
            'memory': str(memory_mb),
            'vCores': str(vcores),
        },
    }

    try:
        # Step 1: reserve an application ID.
        response = requests.post(
            f"{resourcemanager_url}/ws/v1/cluster/apps/new-application",
            headers=headers,
            timeout=30,
        )
        if response.status_code == 200:
            application_id = response.json()['application-id']
            data['application-id'] = application_id

            # Step 2: submit the application; HTTP 202 means it was accepted.
            response = requests.post(
                f"{resourcemanager_url}/ws/v1/cluster/apps",
                headers=headers,
                json=data,
                timeout=30,
            )
            if response.status_code == 202:
                print("Hadoop job submitted successfully")
                return application_id

        print(f"ResourceManager answered HTTP {response.status_code}: {response.text}")
    except (requests.RequestException, ValueError, KeyError) as exc:
        # Connection problems, non-JSON bodies and missing fields all end here.
        print(f"Job submission error: {exc!r}")

    print("Failed to submit Hadoop job")
    return None

def track_resource_usage(application_id, poll_interval=5):
    """Periodically print the containers allocated to an application.

    On each pass the latest application attempt is looked up, its containers
    are printed, and the application state is checked. Tracking stops when
    the application reaches a terminal state or a request fails.

    Args:
        application_id: ID of the application to track.
        poll_interval: Seconds to wait between passes.
    """
    headers = {'Content-Type': 'application/json'}
    app_url = f"{resourcemanager_url}/ws/v1/cluster/apps/{application_id}"

    while True:
        try:
            response = requests.get(f"{app_url}/appattempts", headers=headers, timeout=30)
            if response.status_code != 200:
                print("Failed to get app attempts")
                break

            # The list is wrapped as {'appAttempts': {'appAttempt': [...]}}.
            # A bare list is tolerated as well.
            app_attempts = response.json().get('appAttempts') or {}
            if isinstance(app_attempts, dict):
                app_attempts = app_attempts.get('appAttempt') or []
            if not app_attempts:
                print("No app attempts found")
                break

            last_attempt_id = app_attempts[-1]['appAttemptId']
            response = requests.get(
                f"{app_url}/appattempts/{last_attempt_id}/containers",
                headers=headers,
                timeout=30,
            )
            if response.status_code != 200:
                print("Failed to retrieve containers")
                break

            # Same wrapping convention: {'containers': {'container': [...]}}.
            containers = response.json().get('containers') or {}
            if isinstance(containers, dict):
                containers = containers.get('container') or []
            for container in containers:
                print("Container ID:", container['containerId'])
                print("Allocated Memory:", container['allocatedMB'])
                print("Allocated vCores:", container['allocatedVCores'])
                print("")

                # You can perform further analysis or processing of the container information here

            # Without this check the loop would keep polling forever, because
            # the attempts endpoint still answers after the job has ended.
            response = requests.get(app_url, headers=headers, timeout=30)
            if response.status_code == 200:
                state = response.json()['app']['state']
                if state in ('FINISHED', 'FAILED', 'KILLED'):
                    print("Job Status:", state)
                    break
        except (requests.RequestException, ValueError, KeyError) as exc:
            print(f"Failed to track resource usage: {exc!r}")
            break

        time.sleep(poll_interval)

# Example usage:
# Placeholder path and resource request; adjust for the real job.
job_file = '/path/to/hadoop/job.jar'
memory_mb = 1024
vcores = 1

# Tracking only runs when submission returned an ID
# (submit_hadoop_job returns None on failure).
application_id = submit_hadoop_job(job_file, memory_mb, vcores)
if application_id:
    track_resource_usage(application_id)


9. Write a Python program that compares the performance of a MapReduce job with different input split sizes, showcasing the impact on overall job execution time.


In [None]:
import subprocess
import time

# Specify the path to the Hadoop Streaming JAR file
# Placeholder path; replace with the JAR shipped with the Hadoop install.
hadoop_streaming_jar = "/path/to/hadoop-streaming.jar"

# Specify the Hadoop input directory and output directory
# Placeholder paths passed to the job as -input / -output.
input_dir = "/path/to/input"
output_dir = "/path/to/output"

# Specify the different input split sizes to compare
# Kept as strings because they are concatenated into the job options;
# the values are reported as megabytes by the comparison loop.
input_split_sizes = ["64", "128", "256"]

def run_mapreduce_job(input_split_size):
    """Run the streaming job with the given split size and return its duration.

    Args:
        input_split_size: Split size in megabytes, as a string or an int.

    Returns:
        Wall-clock seconds spent in the ``hadoop jar`` command.

    Raises:
        subprocess.CalledProcessError: If the job exits with a non-zero
            status; the timing of a failed run is not a valid measurement.
        ValueError: If *input_split_size* is not an integer value.
    """
    # The split-size properties take a plain byte count, so convert from MB.
    split_bytes = str(int(input_split_size) * 1024 * 1024)

    # One output directory per run: Hadoop refuses to start a job whose
    # output directory already exists.
    job_output_dir = f"{output_dir}_split{input_split_size}mb"

    # Generic options (-D, -files) must come before the streaming options.
    cmd = [
        "hadoop",
        "jar",
        hadoop_streaming_jar,
        "-D",
        "mapreduce.input.fileinputformat.split.minsize=" + split_bytes,
        "-D",
        "mapreduce.input.fileinputformat.split.maxsize=" + split_bytes,
        # Ship the local scripts to the task nodes.
        "-files",
        "mapper.py,reducer.py",
        "-input",
        input_dir,
        "-output",
        job_output_dir,
        "-mapper",
        "mapper.py",
        "-reducer",
        "reducer.py",
        "-inputformat",
        "org.apache.hadoop.mapred.TextInputFormat",
    ]

    # perf_counter is monotonic, so clock adjustments cannot skew the result.
    start_time = time.perf_counter()
    subprocess.run(cmd, check=True)
    return time.perf_counter() - start_time

# Example usage:
# Local streaming scripts. They are code, not job input, so they must not be
# copied into the HDFS input directory: doing so would make the job read them
# as input records, and the repeated upload fails once the files exist.
mapper_script = "mapper.py"
reducer_script = "reducer.py"

for split_size in input_split_sizes:
    execution_time = run_mapreduce_job(split_size)

    print(f"Execution Time with Input Split Size {split_size} MB: {execution_time} seconds")
