In [1]:
# Install Java 11, which the Hadoop tools below need at runtime.
# Refresh the package index first so the install does not fail on a stale index.
!apt-get update -qq > /dev/null
!apt-get install -y -qq openjdk-11-jdk-headless > /dev/null

# Download and unpack Hadoop 3.3.6 into /usr/local/hadoop.
# The whole step is skipped when /usr/local/hadoop already exists, so re-running
# this cell neither downloads again nor moves a second copy *inside* the
# existing install directory (which is what a bare `mv` would do).
# downloads.apache.org only carries current releases; archive.apache.org keeps
# every release, so it is tried as a fallback if the first URL is gone.
!test -d /usr/local/hadoop || ( \
    ( wget -q -O hadoop-3.3.6.tar.gz https://downloads.apache.org/hadoop/common/hadoop-3.3.6/hadoop-3.3.6.tar.gz \
      || wget -q -O hadoop-3.3.6.tar.gz https://archive.apache.org/dist/hadoop/common/hadoop-3.3.6/hadoop-3.3.6.tar.gz ) \
    && tar -xzf hadoop-3.3.6.tar.gz \
    && mv hadoop-3.3.6 /usr/local/hadoop )


In [2]:
import os

# Point this kernel, and every `!` command it spawns, at the Java and Hadoop installs.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["HADOOP_HOME"] = "/usr/local/hadoop"

# Append Hadoop's bin/ and sbin/ to PATH only when they are missing, so
# re-running this cell does not keep growing PATH with duplicate entries.
path_entries = os.environ["PATH"].split(":")
for hadoop_subdir in ("bin", "sbin"):
    hadoop_dir = os.environ["HADOOP_HOME"] + "/" + hadoop_subdir
    if hadoop_dir not in path_entries:
        path_entries.append(hadoop_dir)
os.environ["PATH"] = ":".join(path_entries)


In [3]:
# Sanity check: print the Hadoop version to confirm the `hadoop` launcher is
# reachable through PATH and can start.
!hadoop version


Hadoop 3.3.6
Source code repository https://github.com/apache/hadoop.git -r 1be78238728da9266a4f88195058f08fd012bf9c
Compiled by ubuntu on 2023-06-18T08:22Z
Compiled on platform linux-x86_64
Compiled with protoc 3.7.1
From source with checksum 5652179ad55f76cb287d9c633bb53bbd
This command was run using /usr/local/hadoop/share/hadoop/common/hadoop-common-3.3.6.jar


In [4]:
%%writefile input.txt
Hadoop MapReduce is a programming model
for processing large data sets
with a distributed algorithm
on a cluster
MapReduce programs are written in various languages


Writing input.txt


In [5]:
%%writefile mapper.py
#!/usr/bin/env python3
"""Word-count mapper for Hadoop Streaming.

Reads text lines from stdin and, for every whitespace-separated token, writes
one record to stdout consisting of the lower-cased token, a tab, and the
literal count 1.
"""
import sys


def map_line(line):
    """Return the output records for a single input line.

    Args:
        line: One line of input text; surrounding whitespace and the trailing
            newline are ignored.

    Returns:
        A list with one "<word><TAB>1" string per token, in input order.
        Blank or whitespace-only lines produce an empty list.
    """
    # str.split() with no argument already discards leading/trailing
    # whitespace and collapses runs of it, so no separate strip() is needed.
    return [f"{word.lower()}\t1" for word in line.split()]


def main(stream=None):
    """Map every line of `stream` (stdin by default) and print the records."""
    for line in (sys.stdin if stream is None else stream):
        for record in map_line(line):
            print(record)


# Guarded entry point: running the script behaves exactly as before, while
# importing it (e.g. from a test) no longer consumes stdin.
if __name__ == "__main__":
    main()


Writing mapper.py


In [6]:
%%writefile reducer.py
#!/usr/bin/env python3
"""Word-count reducer for Hadoop Streaming.

Reads "<word><TAB><count>" records from stdin, adds up the counts for each
word, and writes one "<word><TAB><total>" line per word to stdout, ordered by
word.
"""
import sys
from collections import Counter


def count_words(lines):
    """Sum the counts per word across all input records.

    Args:
        lines: Iterable of "<word><TAB><count>" strings. Surrounding
            whitespace is ignored and blank lines are skipped.

    Returns:
        A list of (word, total) tuples sorted by word.

    Raises:
        ValueError: If a non-blank line has no tab separator or its count is
            not an integer. The message includes the offending record.
    """
    # Accumulate per word instead of sorting every record: memory use is one
    # entry per distinct word rather than one per input line, and the input
    # does not have to arrive sorted.
    totals = Counter()
    for line in lines:
        record = line.strip()
        if not record:
            continue  # tolerate blank lines rather than failing on them
        word, separator, count = record.partition("\t")
        try:
            if not separator:
                raise ValueError("missing tab separator")
            totals[word] += int(count)
        except ValueError as exc:
            raise ValueError(
                f"malformed reducer input {record!r}: {exc}"
            ) from None
    return sorted(totals.items())


def main(stream=None):
    """Reduce the records in `stream` (stdin by default) and print the totals."""
    for word, total in count_words(sys.stdin if stream is None else stream):
        print(f"{word}\t{total}")


# Guarded entry point: running the script reads stdin as before, while
# importing it (e.g. from a test) has no side effects.
if __name__ == "__main__":
    main()


Writing reducer.py


In [7]:
# Mark both scripts executable: the streaming job invokes them by file name
# (`-mapper mapper.py`, `-reducer reducer.py`), relying on their
# `#!/usr/bin/env python3` shebang line to select the interpreter.
!chmod +x mapper.py reducer.py


In [8]:
# Initialise the NameNode storage and start the HDFS daemons.
# -nonInteractive makes the format step abort, instead of waiting on a Y/N
# confirmation, when the storage directory has already been formatted (for
# example when this cell is run a second time).
# NOTE(review): no core-site.xml / fs.defaultFS is configured anywhere in this
# notebook, so the `hadoop fs` commands below presumably resolve against the
# local filesystem rather than HDFS — confirm whether HDFS is actually intended.
!hdfs namenode -format -nonInteractive
!start-dfs.sh

# Stage the sample text as job input. -p and -f keep these steps re-runnable:
# without them a second run fails because /input and the file already exist.
!hadoop fs -mkdir -p /input
!hadoop fs -put -f input.txt /input
!hadoop fs -ls /input


2025-10-15 11:21:15,771 INFO namenode.NameNode: STARTUP_MSG: 
/************************************************************
STARTUP_MSG: Starting NameNode
STARTUP_MSG:   host = 5a853759c36f/172.28.0.12
STARTUP_MSG:   args = [-format]
STARTUP_MSG:   version = 3.3.6
STARTUP_MSG:   classpath = /usr/local/hadoop/etc/hadoop:/usr/local/hadoop/share/hadoop/common/lib/jetty-util-9.4.51.v20230217.jar:/usr/local/hadoop/share/hadoop/common/lib/jetty-security-9.4.51.v20230217.jar:/usr/local/hadoop/share/hadoop/common/lib/failureaccess-1.0.jar:/usr/local/hadoop/share/hadoop/common/lib/jackson-core-asl-1.9.13.jar:/usr/local/hadoop/share/hadoop/common/lib/metrics-core-3.2.4.jar:/usr/local/hadoop/share/hadoop/common/lib/netty-codec-4.1.89.Final.jar:/usr/local/hadoop/share/hadoop/common/lib/netty-codec-smtp-4.1.89.Final.jar:/usr/local/hadoop/share/hadoop/common/lib/guava-27.0-jre.jar:/usr/local/hadoop/share/hadoop/common/lib/j2objc-annotations-1.1.jar:/usr/local/hadoop/share/hadoop/common/lib/snappy-ja

In [9]:
# Remove output left by a previous run: Hadoop refuses to start a job whose
# output directory already exists. -f suppresses the error on the first run,
# when there is nothing to delete yet.
!hadoop fs -rm -r -f /output

# Run the word count through Hadoop Streaming, using the two Python scripts as
# the map and reduce commands.
!hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-3.3.6.jar \
    -input /input/input.txt \
    -output /output \
    -mapper mapper.py \
    -reducer reducer.py


2025-10-15 11:21:49,274 INFO impl.MetricsConfig: Loaded properties from hadoop-metrics2.properties
2025-10-15 11:21:49,544 INFO impl.MetricsSystemImpl: Scheduled Metric snapshot period at 10 second(s).
2025-10-15 11:21:49,545 INFO impl.MetricsSystemImpl: JobTracker metrics system started
2025-10-15 11:21:49,572 WARN impl.MetricsSystemImpl: JobTracker metrics system already initialized!
2025-10-15 11:21:49,944 INFO mapred.FileInputFormat: Total input files to process : 1
2025-10-15 11:21:49,983 INFO mapreduce.JobSubmitter: number of splits:1
2025-10-15 11:21:50,473 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_local874769224_0001
2025-10-15 11:21:50,473 INFO mapreduce.JobSubmitter: Executing with tokens: []
2025-10-15 11:21:50,930 INFO mapreduce.Job: The url to track the job: http://localhost:8080/
2025-10-15 11:21:50,935 INFO mapreduce.Job: Running job: job_local874769224_0001
2025-10-15 11:21:50,943 INFO mapred.LocalJobRunner: OutputCommitter set in config null
2025-10-1

In [10]:
# Print the job result stored under /output: one "<word><TAB><total>" line per
# distinct word, as written by reducer.py.
!hadoop fs -cat /output/part-00000


a	3
algorithm	1
are	1
cluster	1
data	1
distributed	1
for	1
hadoop	1
in	1
is	1
languages	1
large	1
mapreduce	2
model	1
on	1
processing	1
programming	1
programs	1
sets	1
various	1
with	1
written	1
