In [None]:
# ==============================
# STEP 1: Install Java & Hadoop
# ==============================
!apt-get update -y
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# Download Hadoop 3.3.6
!wget -q https://downloads.apache.org/hadoop/common/hadoop-3.3.6/hadoop-3.3.6.tar.gz
!tar -xzf hadoop-3.3.6.tar.gz
!mv hadoop-3.3.6 /usr/local/hadoop

# ==============================
# STEP 2: Setup Hadoop Env
# ==============================
import os

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["HADOOP_HOME"] = "/usr/local/hadoop"
os.environ["PATH"] = f"{os.environ['HADOOP_HOME']}/bin:{os.environ['JAVA_HOME']}/bin:" + os.environ["PATH"]

!hadoop version


0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
0% [Connecting to archive.ubuntu.com (185.125.190.82)] [Connecting to security.0% [Connecting to archive.ubuntu.com (185.125.190.82)] [Connecting to security.                                                                               Get:2 https://cli.github.com/packages stable InRelease [3,917 B]
Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:8 https://cli.github.com/packages stable/main amd64 Packages [346 B]
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu ja

In [None]:
# ==============================
# STEP 3: Prepare Input Files
# ==============================
!mkdir input_data

with open("input_data/input1.txt", "w") as f:
    f.write("Hello World Bye World\n")

with open("input_data/input2.txt", "w") as f:
    f.write("Hello Hadoop Goodbye Hadoop\n")

!cat input_data/*


Hello World Bye World
Hello Hadoop Goodbye Hadoop


In [None]:
%%bash
cat > mapper.py <<'EOF'
#!/usr/bin/env python3
"""Streaming mapper: write one "word<TAB>1" line per whitespace-separated token on stdin."""
import sys

emit = sys.stdout.write

for raw_line in sys.stdin:
    # str.split() with no separator already ignores leading/trailing whitespace.
    for token in raw_line.split():
        emit(f"{token}\t1\n")
EOF

cat > combiner.py <<'EOF'
#!/usr/bin/env python3
"""Streaming combiner: pre-aggregate "word<TAB>count" records.

Reads records from stdin and writes one "word<TAB>total" line per distinct
word to stdout.
"""
import sys
from collections import defaultdict


def combine(lines):
    """Sum the counts per word.

    Args:
        lines: iterable of "word<TAB>count" strings (trailing newline optional).

    Returns:
        dict mapping each word to its summed count, in first-seen order.

    Raises:
        ValueError: if a non-blank record is not exactly "word<TAB>integer".
    """
    totals = defaultdict(int)
    for line in lines:
        record = line.strip()
        if not record:
            # A blank line carries no record; unpacking it would raise ValueError
            # and abort the whole task, so skip it.
            continue
        word, count = record.split("\t")
        totals[word] += int(count)
    return dict(totals)


if __name__ == "__main__":
    for word, total in combine(sys.stdin).items():
        print(f"{word}\t{total}")
EOF

cat > reducer.py <<'EOF'
#!/usr/bin/env python3
"""Streaming reducer: total the counts of consecutive records sharing a word.

Reads "word<TAB>count" lines from stdin and writes one "word<TAB>total" line
per run of identical words to stdout. Records for the same word must be
adjacent in the input for the totals to be complete.
"""
import sys


def reduce_counts(lines):
    """Yield (word, total) for each run of consecutive identical words.

    Args:
        lines: iterable of "word<TAB>count" strings (trailing newline optional).

    Yields:
        (word, total) tuples in input order.

    Raises:
        ValueError: if a non-blank record is not exactly "word<TAB>integer".
    """
    current_word = None
    current_count = 0

    for line in lines:
        record = line.strip()
        if not record:
            # A blank line carries no record; unpacking it would raise ValueError
            # and abort the whole task, so skip it.
            continue
        word, count = record.split("\t")
        count = int(count)

        if word == current_word:
            current_count += count
        else:
            # `is not None` tests the "nothing seen yet" sentinel explicitly
            # instead of relying on the truthiness of the word.
            if current_word is not None:
                yield current_word, current_count
            current_word, current_count = word, count

    # Flush the final run.
    if current_word is not None:
        yield current_word, current_count


if __name__ == "__main__":
    for word, total in reduce_counts(sys.stdin):
        print(f"{word}\t{total}")
EOF

chmod +x mapper.py combiner.py reducer.py


In [None]:
# ==============================
# STEP 5: Run Hadoop Streaming Job
# ==============================
!hadoop fs -rm -r -f input output

# Put input files into HDFS
!hadoop fs -mkdir -p input
!hadoop fs -put input_data/* input/

# Run Hadoop Streaming
!hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-*.jar \
  -input input \
  -output output \
  -mapper mapper.py \
  -combiner combiner.py \
  -reducer reducer.py \
  -file mapper.py \
  -file combiner.py \
  -file reducer.py


2025-09-25 04:37:26,098 INFO Configuration.deprecation: io.bytes.per.checksum is deprecated. Instead, use dfs.bytes-per-checksum
Deleted input
Deleted output
2025-09-25 04:37:30,587 WARN streaming.StreamJob: -file option is deprecated, please use generic option -files instead.
packageJobJar: [mapper.py, combiner.py, reducer.py] [] /tmp/streamjob7637688276975276996.jar tmpDir=null
2025-09-25 04:37:31,100 INFO impl.MetricsConfig: Loaded properties from hadoop-metrics2.properties
2025-09-25 04:37:31,192 INFO impl.MetricsSystemImpl: Scheduled Metric snapshot period at 10 second(s).
2025-09-25 04:37:31,192 INFO impl.MetricsSystemImpl: JobTracker metrics system started
2025-09-25 04:37:31,210 WARN impl.MetricsSystemImpl: JobTracker metrics system already initialized!
2025-09-25 04:37:31,392 INFO mapred.FileInputFormat: Total input files to process : 2
2025-09-25 04:37:31,410 INFO mapreduce.JobSubmitter: number of splits:2
2025-09-25 04:37:31,546 INFO mapreduce.JobSubmitter: Submitting tokens

In [None]:
# ==============================
# STEP 6: Check Output
# ==============================
!hadoop fs -cat output/part-00000


Bye	1
Goodbye	1
Hadoop	2
Hello	2
World	2
