1. Write a Python program to read a Hadoop configuration file and display the core components of Hadoop.

In [None]:
import configparser

def display_core_components(conf_dir='/path/to/hadoop/conf'):
    """Print the main Hadoop settings found in a configuration directory.

    Hadoop's ``*-site.xml`` files are XML documents made of
    ``<property><name>..</name><value>..</value></property>`` entries, which
    ``configparser`` (an INI parser) cannot read, so they are parsed with
    ``xml.etree.ElementTree`` instead.

    Args:
        conf_dir: Directory holding ``core-site.xml``, ``mapred-site.xml`` and
            ``yarn-site.xml``. Missing or malformed files are reported and
            skipped rather than aborting the whole report.

    Returns:
        None. The values are written to standard output; a property that is
        absent from every file is shown as ``not set``.
    """
    # Imported locally so the function is self-contained.
    import os
    import xml.etree.ElementTree as ET

    properties = {}
    # The three settings are conventionally split across these files, so all
    # of them are merged into one lookup table (later files win on clashes).
    for file_name in ('core-site.xml', 'mapred-site.xml', 'yarn-site.xml'):
        path = os.path.join(conf_dir, file_name)
        try:
            root = ET.parse(path).getroot()
        except (OSError, ET.ParseError) as exc:
            print(f'Warning: could not read {path}: {exc}')
            continue
        for prop in root.iter('property'):
            name = (prop.findtext('name') or '').strip()
            if name:
                properties[name] = (prop.findtext('value') or '').strip()

    default_fs = properties.get('fs.defaultFS', 'not set')
    framework_name = properties.get('mapreduce.framework.name', 'not set')
    resource_manager_address = properties.get(
        'yarn.resourcemanager.address', 'not set')

    # Display the core components
    print('Default File System:', default_fs)
    print('MapReduce Framework:', framework_name)
    print('Resource Manager Address:', resource_manager_address)

# Call the function to display the core components.
# NOTE: this statement sits at module level with no __main__ guard, so it
# runs whenever the file is executed or imported.
display_core_components()


2. Implement a Python function that calculates the total file size in a Hadoop Distributed File System (HDFS) directory.

In [None]:
from hdfs import InsecureClient

def calculate_total_file_size(hdfs_url, hdfs_directory):
    """Return the combined size, in megabytes, of all files under an HDFS path.

    Args:
        hdfs_url: WebHDFS base URL passed to ``InsecureClient``.
        hdfs_directory: HDFS directory that is traversed recursively.

    Returns:
        The summed ``length`` of every file found, divided by 1024 * 1024.

    Raises:
        Whatever the HDFS client raises for an unreachable cluster or a
        missing path; no errors are swallowed here.
    """
    # Create an HDFS client
    client = InsecureClient(hdfs_url)

    total_size = 0

    # With status=True the walk hands back (name, FileStatus) pairs, so each
    # file's length comes from the directory listing itself instead of costing
    # one additional status() round trip per file.
    for _root, _dirs, files in client.walk(hdfs_directory, status=True):
        total_size += sum(status['length'] for _name, status in files)

    # Convert bytes to megabytes before returning
    return total_size / (1024 * 1024)

# Example usage (executed at module level; there is no __main__ guard here)
hdfs_url = 'http://localhost:9870'  # Replace with your HDFS URL
hdfs_directory = '/user/example'  # Replace with your HDFS directory
# The returned figure is already expressed in megabytes (bytes / 1024 / 1024)
total_size = calculate_total_file_size(hdfs_url, hdfs_directory)
# Show the result rounded to two decimal places
print(f"Total File Size: {total_size:.2f} MB")


3. Create a Python program that extracts and displays the top N most frequent words from a large text file using the MapReduce approach.

In [None]:
from mrjob.job import MRJob
from mrjob.step import MRStep
import re

class MRWordFrequency(MRJob):
    """Two-step MapReduce job that reports the most frequent words.

    Step 1 counts the occurrences of every lower-cased word; step 2 funnels
    all (count, word) pairs to a single reducer, which emits the top entries.

    The number of entries can be set with ``--top-n``. When the option is not
    given, a module-level ``N`` is used if the running script defines one,
    otherwise 10.
    """

    # Compiled once per process instead of being looked up for every line.
    WORD_RE = re.compile(r'\w+')

    def configure_args(self):
        """Register the optional ``--top-n`` command-line argument."""
        super(MRWordFrequency, self).configure_args()
        self.add_passthru_arg(
            '--top-n', type=int, default=None,
            help='Number of most frequent words to output')

    def _top_n(self):
        """Resolve how many words to emit (option, then global N, then 10)."""
        requested = self.options.top_n
        if requested is None:
            requested = globals().get('N', 10)
        return max(0, int(requested))

    def mapper_get_words(self, _, line):
        """Emit ``(word, 1)`` for each word in the lower-cased line."""
        for word in self.WORD_RE.findall(line.lower()):
            yield word, 1

    def combiner_count_words(self, word, counts):
        """Pre-aggregate the counts emitted by one mapper."""
        yield word, sum(counts)

    def reducer_count_words(self, word, counts):
        """Produce the total count for a word."""
        yield word, sum(counts)

    def mapper_sort_words(self, word, count):
        """Re-key every pair under ``None`` so one reducer sees them all."""
        yield None, (count, word)

    def reducer_sort_words(self, _, count_word_pairs):
        """Emit the highest-count words in descending order of count.

        Slicing (rather than indexing ``range(N)``) keeps inputs with fewer
        distinct words than requested from raising IndexError.
        """
        ranked = sorted(count_word_pairs, reverse=True)
        for count, word in ranked[:self._top_n()]:
            yield word, count

    def steps(self):
        """Describe the count step followed by the ranking step."""
        return [
            MRStep(mapper=self.mapper_get_words,
                   combiner=self.combiner_count_words,
                   reducer=self.reducer_count_words),
            MRStep(mapper=self.mapper_sort_words,
                   reducer=self.reducer_sort_words)
        ]

if __name__ == '__main__':
    import sys

    N = 10  # Number of top words to display
    input_file = 'path/to/large/text/file.txt'  # Replace with the path to your large text file

    # MRJob.run() takes no arguments: it reads its options from sys.argv, and
    # it is also the entry point mrjob uses when it re-invokes this script for
    # individual mapper/reducer tasks. The defaults are therefore supplied
    # only when the script is started without any arguments of its own.
    if len(sys.argv) == 1:
        sys.argv += ['-r', 'local', input_file]

    # Run the MapReduce job
    MRWordFrequency.run()


4. Write a Python script that checks the health status of the NameNode and DataNodes in a Hadoop cluster using Hadoop's REST API.

In [None]:
import requests
import json

def check_namenode_health(namenode_url):
    """Report whether the NameNode at ``namenode_url`` is up and active.

    The HA state is exposed by the ``NameNodeStatus`` JMX bean through its
    ``State`` attribute (``NNRole`` on that bean holds the role name, not the
    state), so that bean is queried. ``NNRole == "active"`` is still accepted
    for backward compatibility.

    Args:
        namenode_url: Base HTTP URL of the NameNode web UI.

    Returns:
        True if a returned bean reports the active state. False otherwise,
        including when the endpoint is unreachable, answers with an HTTP
        error, or does not return JSON.
    """
    jmx_url = f"{namenode_url}/jmx?qry=Hadoop:service=NameNode,name=NameNodeStatus"
    try:
        # A timeout keeps a hung node from blocking the health check forever.
        response = requests.get(jmx_url, timeout=10)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError):
        # For a health probe, an unreachable or garbled endpoint is unhealthy.
        return False

    for bean in data.get("beans", []):
        if bean.get("State") == "active" or bean.get("NNRole") == "active":
            return True

    return False

def check_datanode_health(datanode_url):
    """Report whether the DataNode at ``datanode_url`` has no failed volumes.

    The ``FSDatasetState`` bean publishes the failed-volume count as
    ``NumFailedVolumes``; ``VolumeFailuresTotal`` is still honoured when
    present so that any endpoint the previous check accepted keeps passing.

    Args:
        datanode_url: Base HTTP URL of the DataNode web UI.

    Returns:
        True if a returned bean reports zero failed volumes. False otherwise,
        including when the endpoint is unreachable, answers with an HTTP
        error, or does not return JSON.
    """
    jmx_url = f"{datanode_url}/jmx?qry=Hadoop:service=DataNode,name=FSDatasetState"
    try:
        # A timeout keeps a hung node from blocking the health check forever.
        response = requests.get(jmx_url, timeout=10)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError):
        # For a health probe, an unreachable or garbled endpoint is unhealthy.
        return False

    for bean in data.get("beans", []):
        failed_volumes = bean.get("NumFailedVolumes", bean.get("VolumeFailuresTotal"))
        if failed_volumes == 0:
            return True

    return False

if __name__ == "__main__":
    # Endpoints to probe; substitute the hosts of your own cluster.
    namenode_url = "http://<namenode-hostname>:50070"  # Replace with the URL of your NameNode
    datanode_urls = [
        "http://<datanode1-hostname>:50075",  # Replace with the URLs of your DataNodes
        "http://<datanode2-hostname>:50075",
        "http://<datanode3-hostname>:50075"
    ]

    # Report on the NameNode first
    namenode_health = check_namenode_health(namenode_url)
    print("NameNode is healthy" if namenode_health else "NameNode is not healthy")

    # Then report on every DataNode, one line per node
    for datanode_url in datanode_urls:
        datanode_health = check_datanode_health(datanode_url)
        print(
            f"DataNode at {datanode_url} is healthy"
            if datanode_health
            else f"DataNode at {datanode_url} is not healthy"
        )


5. Develop a Python program that lists all the files and directories in a specific HDFS path.

In [None]:
from hdfs import InsecureClient

def list_hdfs_path(hdfs_url, hdfs_path):
    """Print every direct child of an HDFS path, labelled as file or directory.

    Args:
        hdfs_url: WebHDFS base URL passed to ``InsecureClient``.
        hdfs_path: HDFS directory whose contents are listed (not recursive).

    Returns:
        None. One line per entry is written to standard output.
    """
    # Create an HDFS client
    client = InsecureClient(hdfs_url)

    # Strip a trailing slash so joined paths never contain "//".
    base_path = hdfs_path.rstrip('/')

    # With status=True each entry is a (name, FileStatus) tuple rather than a
    # dict, and FileStatus carries no full path, so the path is rebuilt here.
    for name, status in client.list(hdfs_path, status=True):
        full_path = f"{base_path}/{name}"
        if status['type'] == 'DIRECTORY':
            print('Directory:', full_path)
        else:
            print('File:', full_path)

# Example usage (executed at module level; there is no __main__ guard here)
hdfs_url = 'http://localhost:9870'  # Replace with your HDFS URL
hdfs_path = '/user/example'  # Replace with the desired HDFS path
# Prints one line per entry; the function itself returns nothing
list_hdfs_path(hdfs_url, hdfs_path)


6. Implement a Python program that analyzes the storage utilization of DataNodes in a Hadoop cluster and identifies the nodes with the highest and lowest storage capacities.

In [None]:
import requests
import json

def get_datanode_info(hadoop_url):
    """Fetch per-DataNode storage figures from the NameNode's JMX endpoint.

    The NameNode does not serve the ``DataNodeInfo`` bean (each DataNode
    publishes its own). It does publish ``NameNodeInfo``, whose ``LiveNodes``
    attribute is a JSON document, serialized as a string, keyed by DataNode
    address.

    Args:
        hadoop_url: Base HTTP URL of the NameNode web UI.

    Returns:
        A list with one dict per live DataNode. Each dict holds the fields
        reported by the NameNode plus two normalized keys: ``HostAndPort``
        (the node's key in ``LiveNodes``) and ``Capacity`` (its ``capacity``
        value, 0 if absent). The list is empty when nothing is reported.

    Raises:
        requests.RequestException: if the request fails or returns an HTTP
            error status.
    """
    namenode_info_url = f"{hadoop_url}/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo"
    response = requests.get(namenode_info_url, timeout=10)
    response.raise_for_status()
    data = response.json()

    datanodes = []
    for bean in data.get("beans", []):
        live_nodes = bean.get("LiveNodes")
        if not live_nodes:
            continue
        # LiveNodes is a JSON string embedded in the JSON response.
        for host_and_port, details in json.loads(live_nodes).items():
            node = dict(details)
            node["HostAndPort"] = host_and_port
            node["Capacity"] = details.get("capacity", 0)
            datanodes.append(node)

    return datanodes

def analyze_datanode_storage_utilization(hadoop_url):
    """Print which DataNodes have the highest and the lowest storage capacity.

    Args:
        hadoop_url: Base HTTP URL of the NameNode web UI.

    Returns:
        None. Results (or a notice that no DataNodes were found) are printed.
    """
    # Retrieve the DataNode information
    datanodes = get_datanode_info(hadoop_url)

    if not datanodes:
        print("No DataNode information found.")
        return

    # Only the two extremes are needed, so a full sort is unnecessary.
    highest_capacity_datanode = max(datanodes, key=lambda d: d["Capacity"])
    lowest_capacity_datanode = min(datanodes, key=lambda d: d["Capacity"])

    highest_capacity_hostname = highest_capacity_datanode["HostAndPort"]
    lowest_capacity_hostname = lowest_capacity_datanode["HostAndPort"]

    # Display the results
    print(f"Highest storage capacity: {highest_capacity_hostname}")
    print(f"Lowest storage capacity: {lowest_capacity_hostname}")

if __name__ == "__main__":
    # Base URL handed to the analysis function; the placeholder host must be
    # replaced before running.
    hadoop_url = "http://<namenode-hostname>:50070"  # Replace with the URL of your NameNode

    # Analyze the storage utilization of DataNodes (results are printed by
    # the function; nothing is returned to this script)
    analyze_datanode_storage_utilization(hadoop_url)


7. Create a Python script that interacts with YARN's ResourceManager API to submit a Hadoop job, monitor its progress, and retrieve the final output.

In [None]:
import requests
import time
import json

def submit_hadoop_job(resource_manager_url, job_parameters):
    """Submit an application through the YARN ResourceManager REST API.

    Submission is a two-step protocol: first ask the ResourceManager for a new
    application ID, then POST the submission context to the cluster ``apps``
    collection. A successful submission answers ``202 Accepted`` with an
    empty body, so the ID from the first step is what identifies the job.

    Args:
        resource_manager_url: Base HTTP URL of the ResourceManager.
        job_parameters: The launch command, either as a single string or as a
            sequence of arguments that is joined with spaces.

    Returns:
        The application ID assigned by the ResourceManager.

    Raises:
        requests.RequestException: if either request fails or is rejected
            with an HTTP error status.
    """
    # Step 1: reserve an application ID
    new_application_url = f"{resource_manager_url}/ws/v1/cluster/apps/new-application"
    response = requests.post(new_application_url, timeout=30)
    response.raise_for_status()
    application_id = response.json()["application-id"]

    # The REST API expects the container launch command as one string.
    if isinstance(job_parameters, str):
        command = job_parameters
    else:
        command = " ".join(job_parameters)

    # Set the job parameters in the request payload
    payload = {
        "application-id": application_id,
        "application-name": "My Hadoop Job",
        "am-container-spec": {
            "commands": {
                "command": command
            },
            "local-resources": {}
        },
        "application-type": "MAPREDUCE"
    }

    # Step 2: submit the application. There is no JSON body to parse in the
    # reply; raise_for_status() surfaces a rejected submission.
    submit_job_url = f"{resource_manager_url}/ws/v1/cluster/apps"
    response = requests.post(submit_job_url, json=payload, timeout=30)
    response.raise_for_status()

    return application_id

def monitor_job_progress(resource_manager_url, job_id):
    """Poll the ResourceManager until the application reaches a final state.

    The application's state is fetched and printed on every pass. Polling
    stops once the state is FINISHED, FAILED or KILLED; otherwise the
    function sleeps five seconds and asks again. Nothing is returned.
    """
    terminal_states = ("FINISHED", "FAILED", "KILLED")
    status_endpoint = f"{resource_manager_url}/ws/v1/cluster/apps/{job_id}"

    while True:
        # Ask the ResourceManager for the current application report
        report = requests.get(status_endpoint).json()
        state = report["app"]["state"]

        print(f"Job ID: {job_id}, State: {state}")

        # A terminal state ends the polling immediately (no trailing sleep)
        if state in terminal_states:
            return

        # Pause before the next poll
        time.sleep(5)

def retrieve_job_output(resource_manager_url, job_id):
    """Return the log link of the job's most recent attempt.

    The MapReduce REST API addresses a job by its ``job_...`` ID, which shares
    the numeric suffix of the YARN ``application_...`` ID, while the proxy
    segment of the URL uses the application ID. Both forms are derived here
    from whichever one is passed in.

    Args:
        resource_manager_url: Base HTTP URL of the ResourceManager.
        job_id: Either the YARN application ID or the MapReduce job ID.

    Returns:
        The ``logsLink`` of the last job attempt, or None when the response
        lists no attempts.

    Raises:
        requests.RequestException: if the request fails or returns an HTTP
            error status.
    """
    # Derive both identifiers from the shared "<cluster-ts>_<sequence>" suffix.
    if job_id.startswith("application_"):
        suffix = job_id[len("application_"):]
    elif job_id.startswith("job_"):
        suffix = job_id[len("job_"):]
    else:
        suffix = job_id
    application_id = f"application_{suffix}"
    mapreduce_job_id = f"job_{suffix}"

    # Construct the URL for retrieving the job attempts
    job_output_url = (
        f"{resource_manager_url}/proxy/{application_id}"
        f"/ws/v1/mapreduce/jobs/{mapreduce_job_id}/jobattempts"
    )

    # Send a GET request to retrieve the job attempts
    response = requests.get(job_output_url, timeout=30)
    response.raise_for_status()
    data = response.json()

    attempts = (data.get("jobAttempts") or {}).get("jobAttempt") or []
    if not attempts:
        return None

    # Each attempt exposes where its logs can be read via "logsLink"; the last
    # entry is the attempt that produced the final result.
    return attempts[-1].get("logsLink")

if __name__ == "__main__":
    # Connection details and launch command; both are placeholders.
    resource_manager_url = "http://localhost:8088"  # Replace with the URL of your ResourceManager
    job_parameters = ["hadoop", "jar", "/path/to/hadoop-job.jar", "input", "output"]  # Replace with your job parameters
    
    # Submit the Hadoop job and retrieve the job ID
    job_id = submit_hadoop_job(resource_manager_url, job_parameters)
    print(f"Submitted Hadoop job. Job ID: {job_id}")
    
    # Monitor the job progress (this call only returns once polling stops)
    monitor_job_progress(resource_manager_url, job_id)
    
    # Retrieve the final output.
    # NOTE: this runs whatever state the job ended in; the state is not
    # inspected here before the output is requested.
    final_output = retrieve_job_output(resource_manager_url, job_id)
    print(f"Final output: {final_output}")


8. Create a Python script that interacts with YARN's ResourceManager API to submit a Hadoop job, set resource requirements, and track resource usage during job execution.

In [None]:
import requests
import time
import json

def submit_hadoop_job(resource_manager_url, job_parameters, memory_mb, vcores):
    """Submit an application with explicit resource requirements to YARN.

    Submission is a two-step protocol: first ask the ResourceManager for a new
    application ID, then POST the submission context to the cluster ``apps``
    collection. A successful submission answers ``202 Accepted`` with an
    empty body, so the ID from the first step is what identifies the job.

    Args:
        resource_manager_url: Base HTTP URL of the ResourceManager.
        job_parameters: The launch command, either as a single string or as a
            sequence of arguments that is joined with spaces.
        memory_mb: Memory requested for the application master, in megabytes.
        vcores: Number of virtual cores requested for the application master.

    Returns:
        The application ID assigned by the ResourceManager.

    Raises:
        requests.RequestException: if either request fails or is rejected
            with an HTTP error status.
    """
    # Step 1: reserve an application ID
    new_application_url = f"{resource_manager_url}/ws/v1/cluster/apps/new-application"
    response = requests.post(new_application_url, timeout=30)
    response.raise_for_status()
    application_id = response.json()["application-id"]

    # The REST API expects the container launch command as one string.
    if isinstance(job_parameters, str):
        command = job_parameters
    else:
        command = " ".join(job_parameters)

    # Set the job parameters and resource requirements in the request payload
    payload = {
        "application-id": application_id,
        "application-name": "My Hadoop Job",
        "am-container-spec": {
            "commands": {
                "command": command
            },
            "local-resources": {}
        },
        "application-type": "MAPREDUCE",
        "resource": {
            "memory": memory_mb,
            "vCores": vcores
        }
    }

    # Step 2: submit the application. There is no JSON body to parse in the
    # reply; raise_for_status() surfaces a rejected submission.
    submit_job_url = f"{resource_manager_url}/ws/v1/cluster/apps"
    response = requests.post(submit_job_url, json=payload, timeout=30)
    response.raise_for_status()

    return application_id

def monitor_resource_usage(resource_manager_url, job_id):
    """Poll the ResourceManager and print the job's allocated resources.

    Allocated memory, allocated vcores and the application state are fields
    of the application resource (``/ws/v1/cluster/apps/{id}``), not of the
    per-attempt objects, so the application resource is what gets polled.

    Args:
        resource_manager_url: Base HTTP URL of the ResourceManager.
        job_id: YARN application ID to watch.

    Returns:
        None. A status line is printed every five seconds until the state is
        FINISHED, FAILED or KILLED.

    Raises:
        requests.RequestException: if a poll fails or returns an HTTP error
            status.
    """
    # Construct the URL for monitoring the resource usage
    resource_usage_url = f"{resource_manager_url}/ws/v1/cluster/apps/{job_id}"

    while True:
        # Send a GET request to get the resource usage
        response = requests.get(resource_usage_url, timeout=30)
        response.raise_for_status()
        app = response.json()["app"]

        # .get() is used for the allocation figures so a report that omits
        # them still produces a status line instead of a KeyError.
        allocated_memory_mb = app.get("allocatedMB")
        allocated_vcores = app.get("allocatedVCores")
        state = app["state"]

        # Display the resource usage details
        print(f"Job ID: {job_id}, State: {state}, Allocated Memory: {allocated_memory_mb} MB, Allocated vCores: {allocated_vcores}")

        # Check if the job is in a final state
        if state in ("FINISHED", "FAILED", "KILLED"):
            break

        # Wait for 5 seconds before checking the status again
        time.sleep(5)

if __name__ == "__main__":
    # Connection details, launch command and resource request; the URL and
    # the command are placeholders.
    resource_manager_url = "http://localhost:8088"  # Replace with the URL of your ResourceManager
    job_parameters = ["hadoop", "jar", "/path/to/hadoop-job.jar", "input", "output"]  # Replace with your job parameters
    memory_mb = 4096  # Set the memory requirements for the job in megabytes
    vcores = 2  # Set the number of virtual cores required for the job

    # Submit the Hadoop job and retrieve the job ID
    job_id = submit_hadoop_job(resource_manager_url, job_parameters, memory_mb, vcores)
    print(f"Submitted Hadoop job. Job ID: {job_id}")

    # Monitor the resource usage of the job (prints a status line per poll
    # and only returns once polling stops)
    monitor_resource_usage(resource_manager_url, job_id)


9. Write a Python program that compares the performance of a MapReduce job with different input split sizes, showcasing the impact on overall job execution time.

In [None]:
from mrjob.job import MRJob
import time

class MRInputSplitSize(MRJob):
    """Timing job used to compare runs with different input split sizes.

    The map and reduce functions do no real work. Every mapper task records
    when it started and when it finished; the single reducer gathers those
    timestamps and reports the span between the earliest start and the
    latest finish as ``'Execution Time'`` (in seconds).
    """

    def configure_args(self):
        """Register ``--split-size`` (an integer number of megabytes)."""
        super(MRInputSplitSize, self).configure_args()
        self.add_passthru_arg('--split-size', type=int, default=64,
                              help='Input split size in megabytes')

    def jobconf(self):
        """Add the requested split size to the job configuration.

        Existing settings win, so an explicit value is never overridden.
        """
        conf = dict(super(MRInputSplitSize, self).jobconf())
        split_bytes = int(self.options.split_size) * 1024 * 1024
        conf.setdefault('mapreduce.input.fileinputformat.split.minsize', split_bytes)
        conf.setdefault('mapreduce.input.fileinputformat.split.maxsize', split_bytes)
        return conf

    def mapper_init(self):
        """Remember when this mapper task began."""
        self._mapper_started = time.time()

    def mapper(self, _, line):
        """Consume the line without emitting anything."""
        return iter(())

    def mapper_final(self):
        """Emit this task's (start, end) timestamps under the ``None`` key."""
        yield None, (self._mapper_started, time.time())

    def reducer_init(self):
        """Prepare the buffer that reducer_final reads."""
        self.values = []

    def reducer(self, key, values):
        """Collect the mappers' timestamp pairs; nothing is emitted here."""
        self.values.extend(values)
        return iter(())

    def reducer_final(self):
        """Emit the span covered by all collected mapper timestamps."""
        if not self.values:
            # No mapper reported in, so there is nothing to measure.
            return
        start_time = min(started for started, _ in self.values)
        end_time = max(finished for _, finished in self.values)
        execution_time = end_time - start_time
        yield 'Execution Time', execution_time

if __name__ == '__main__':
    # List of input split sizes to compare
    split_sizes = [64, 128, 256, 512]  # In megabytes
    input_file = 'path/to/large/text/file.txt'  # Replace with the path to your large text file

    for split_size in split_sizes:
        # An input path is passed explicitly; without one the job would wait
        # for data on standard input.
        job = MRInputSplitSize(args=['--split-size', str(split_size), input_file])
        with job.make_runner() as runner:
            runner.run()

            # The output must be read before the runner context is left.
            results = list(job.parse_output(runner.cat_output()))

        # Retrieve and display the execution time, if the job reported one
        if results:
            execution_time = results[0][1]
            print(f"Input Split Size: {split_size} MB, Execution Time: {execution_time:.4f} seconds")
        else:
            print(f"Input Split Size: {split_size} MB, no execution time reported")
