In [1]:
# ============================================================
# ðŸ§© Step 1: Install Hadoop (minimal clean setup)
# ============================================================

!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/hadoop/common/hadoop-3.4.1/hadoop-3.4.1.tar.gz
!tar -xzf hadoop-3.4.1.tar.gz
!mv hadoop-3.4.1 /usr/local/hadoop

import os

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["HADOOP_HOME"] = "/usr/local/hadoop"
os.environ["PATH"] += ":/usr/local/hadoop/bin:/usr/local/hadoop/sbin"

# Configure Hadoop environment
!echo "export JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64" >> /usr/local/hadoop/etc/hadoop/hadoop-env.sh


In [6]:
# ============================================================
# ðŸ“¦ Step 2: Prepare Sample Weather Data
# ============================================================

!mkdir -p /content/weather_data

sample_data = """19010101,34
19010102,38
19010103,29
19010104,31
19020101,30
19020102,33
19020103,28
19020104,36
19030101,40
19030102,41
19030103,35
19030104,39
"""

with open("/content/weather_data/temperature.txt", "w") as f:
    f.write(sample_data)

!cat /content/weather_data/temperature.txt


19010101,34
19010102,38
19010103,29
19010104,31
19020101,30
19020102,33
19020103,28
19020104,36
19030101,40
19030102,41
19030103,35
19030104,39


Writing /content/mapper.py


In [8]:
# ============================================================
# Step 4: Run MapReduce Using Hadoop Streaming (fixed)
# ============================================================
# Prerequisites: /content/mapper.py and /content/reducer.py must already
# exist; they are not created in this cell.

# Remove existing output directory (Hadoop can't overwrite).
# If the `hdfs dfs -rm` command exits non-zero, fall back to a plain `rm -rf`.
!hdfs dfs -rm -r -f /content/weather_output || rm -rf /content/weather_output

# Run the MapReduce job: the streaming jar reads temperature.txt, runs each
# script with python3 as the mapper and reducer, and writes the result under
# /content/weather_output. The glob picks up whichever hadoop-streaming jar
# version is present under $HADOOP_HOME.
!hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-*.jar \
    -input /content/weather_data/temperature.txt \
    -output /content/weather_output \
    -mapper "python3 /content/mapper.py" \
    -reducer "python3 /content/reducer.py"


2025-10-13 08:28:39,668 INFO Configuration.deprecation: io.bytes.per.checksum is deprecated. Instead, use dfs.bytes-per-checksum
Deleted /content/weather_output
2025-10-13 08:28:41,521 INFO impl.MetricsConfig: Loaded properties from hadoop-metrics2.properties
2025-10-13 08:28:41,671 INFO impl.MetricsSystemImpl: Scheduled Metric snapshot period at 10 second(s).
2025-10-13 08:28:41,671 INFO impl.MetricsSystemImpl: JobTracker metrics system started
2025-10-13 08:28:41,693 WARN impl.MetricsSystemImpl: JobTracker metrics system already initialized!
2025-10-13 08:28:41,939 INFO mapred.FileInputFormat: Total input files to process : 1
2025-10-13 08:28:41,964 INFO mapreduce.JobSubmitter: number of splits:1
2025-10-13 08:28:42,215 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_local1790326985_0001
2025-10-13 08:28:42,215 INFO mapreduce.JobSubmitter: Executing with tokens: []
2025-10-13 08:28:42,423 INFO mapreduce.Job: The url to track the job: http://localhost:8080/
2025-10-13 08:2

In [9]:
# ============================================================
# Step 5: View the Output
# ============================================================

# Print the single result file of the streaming job; each line is
# tab-separated: a 4-digit key, then Min=<value> and Max=<value>.
!cat /content/weather_output/part-00000


1901	Min=29	Max=38
1902	Min=28	Max=36
1903	Min=35	Max=41


In [11]:
# ============================================================
# ðŸ§© Step 1: Install Java and Apache Pig
# ============================================================

!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/pig/pig-0.17.0/pig-0.17.0.tar.gz
!tar -xzf pig-0.17.0.tar.gz
!mv pig-0.17.0 /usr/local/pig

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["PIG_HOME"] = "/usr/local/pig"
os.environ["PATH"] += ":/usr/local/pig/bin"


In [15]:
# ============================================================
# ðŸ“„ Step 2: Create Sample Employee Data
# ============================================================

data = """1,Arun,IT,35000
2,Bala,HR,28000
3,Kiran,IT,42000
4,Sara,Finance,39000
5,Devi,HR,31000
6,Ram,Finance,45000
"""

with open("/content/employee.txt", "w") as f:
    f.write(data)

!cat /content/employee.txt


1,Arun,IT,35000
2,Bala,HR,28000
3,Kiran,IT,42000
4,Sara,Finance,39000
5,Devi,HR,31000
6,Ram,Finance,45000


In [18]:
# ============================================================
# Step 4: Run Pig Script Locally
# ============================================================

# `-x local` selects Pig's LOCAL exec type, which works against the local
# file system (file:///). The script /content/experiment5.pig must already
# exist; it is not created in this cell.
!pig -x local /content/experiment5.pig


2025-10-13 08:38:56,348 INFO pig.ExecTypeProvider: Trying ExecType : LOCAL
2025-10-13 08:38:56,349 INFO pig.ExecTypeProvider: Picked LOCAL as the ExecType
2025-10-13 08:38:56,439 [main] INFO  org.apache.pig.Main - Apache Pig version 0.17.0 (r1797386) compiled Jun 02 2017, 15:41:58
2025-10-13 08:38:56,439 [main] INFO  org.apache.pig.Main - Logging error messages to: /content/pig_1760344736436.log
2025-10-13 08:38:56,458 [main] INFO  org.apache.hadoop.conf.Configuration.deprecation - user.name is deprecated. Instead, use mapreduce.job.user.name
2025-10-13 08:38:56,672 [main] INFO  org.apache.pig.impl.util.Utils - Default bootup file /root/.pigbootup not found
2025-10-13 08:38:56,753 [main] INFO  org.apache.hadoop.conf.Configuration.deprecation - mapred.job.tracker is deprecated. Instead, use mapreduce.jobtracker.address
2025-10-13 08:38:56,755 [main] INFO  org.apache.pig.backend.hadoop.executionengine.HExecutionEngine - Connecting to hadoop file system at: file:///
2025-10-13 08:38:56,78

In [17]:
# Print the Pig result file. This fails with "No such file or directory"
# when no Pig run has produced /content/output_pig yet.
!cat /content/output_pig/part-r-00000


cat: /content/output_pig/part-r-00000: No such file or directory


In [19]:
# Run the script
!pig -x local /content/experiment5.pig

# View output
!cat /content/output_pig/part-r-00000


2025-10-13 08:39:23,850 INFO pig.ExecTypeProvider: Trying ExecType : LOCAL
2025-10-13 08:39:23,850 INFO pig.ExecTypeProvider: Picked LOCAL as the ExecType
2025-10-13 08:39:23,933 [main] INFO  org.apache.pig.Main - Apache Pig version 0.17.0 (r1797386) compiled Jun 02 2017, 15:41:58
2025-10-13 08:39:23,933 [main] INFO  org.apache.pig.Main - Logging error messages to: /content/pig_1760344763930.log
2025-10-13 08:39:23,953 [main] INFO  org.apache.hadoop.conf.Configuration.deprecation - user.name is deprecated. Instead, use mapreduce.job.user.name
2025-10-13 08:39:24,172 [main] INFO  org.apache.pig.impl.util.Utils - Default bootup file /root/.pigbootup not found
2025-10-13 08:39:24,273 [main] INFO  org.apache.hadoop.conf.Configuration.deprecation - mapred.job.tracker is deprecated. Instead, use mapreduce.jobtracker.address
2025-10-13 08:39:24,275 [main] INFO  org.apache.pig.backend.hadoop.executionengine.HExecutionEngine - Connecting to hadoop file system at: file:///
2025-10-13 08:39:24,30

In [28]:
# Download Hive 3.1.4
!wget https://downloads.apache.org/hive/hive-3.1.4/apache-hive-3.1.4-bin.tar.gz
!tar -xzf apache-hive-3.1.4-bin.tar.gz


--2025-10-13 08:47:35--  https://downloads.apache.org/hive/hive-3.1.4/apache-hive-3.1.4-bin.tar.gz
Resolving downloads.apache.org (downloads.apache.org)... 135.181.214.104, 88.99.208.237, 2a01:4f9:3a:2c57::2, ...
Connecting to downloads.apache.org (downloads.apache.org)|135.181.214.104|:443... connected.
HTTP request sent, awaiting response... 404 Not Found
2025-10-13 08:47:36 ERROR 404: Not Found.

tar (child): apache-hive-3.1.4-bin.tar.gz: Cannot open: No such file or directory
tar (child): Error is not recoverable: exiting now
tar: Child returned status 2
tar: Error is not recoverable: exiting now


In [31]:
import glob
import os

# Locate the unpacked Hive distribution instead of hard-coding one version,
# so HIVE_HOME follows whichever apache-hive-*-bin directory was extracted.
hive_candidates = sorted(glob.glob("/content/apache-hive-*-bin"))
if hive_candidates:
    hive_home = hive_candidates[-1]
else:
    # Nothing is unpacked: keep the previous default path so later cells
    # behave as before, but say why Hive commands will not be found.
    hive_home = "/content/apache-hive-3.1.4-bin"
    print("WARNING: no Hive distribution found under /content; "
          + hive_home + " does not exist")

os.environ["HIVE_HOME"] = hive_home

# Prepend Hive's bin directory once; re-running must not duplicate the entry.
hive_bin = hive_home + "/bin"
if hive_bin not in os.environ["PATH"].split(":"):
    os.environ["PATH"] = hive_bin + ":" + os.environ["PATH"]


In [30]:
# Initialise the Hive metastore schema using a Derby database. The
# `schematool` executable is resolved through PATH, so Hive's bin directory
# must be on PATH before this cell runs.
!schematool -dbType derby -initSchema


/bin/bash: line 1: schematool: command not found


In [32]:
# Diagnostic: check that schematool exists under $HIVE_HOME/bin. A
# "No such file or directory" result means $HIVE_HOME does not point at an
# unpacked Hive distribution.
!ls $HIVE_HOME/bin/schematool


ls: cannot access '/content/apache-hive-3.1.4-bin/bin/schematool': No such file or directory


In [25]:
!hadoop fs -mkdir -p /user/hive/warehouse
!hadoop fs -mkdir /content/hive_data

mkdir: `/content/hive_data': File exists


In [33]:
import sqlite3
import pandas as pd

# Create a sample database. The connection is intentionally left open:
# later cells reuse `conn` to read the table and to write the export.
conn = sqlite3.connect('/content/company.db')
cursor = conn.cursor()

# Rebuild the employee table from scratch. The database file persists between
# runs and the table has no key, so keeping the old table would add the same
# three rows again on every execution of this cell.
cursor.execute('DROP TABLE IF EXISTS employee')
cursor.execute('''
CREATE TABLE employee (
    id INT,
    name TEXT,
    dept TEXT,
    salary INT
)
''')

# Insert sample data (parameterised, one tuple per row).
cursor.executemany('''
INSERT INTO employee (id, name, dept, salary) VALUES (?, ?, ?, ?)
''', [
    (1, 'Alice', 'HR', 35000),
    (2, 'Bob', 'IT', 40000),
    (3, 'Charlie', 'Finance', 38000)
])

# Persist the rebuilt table so other connections see the new rows.
conn.commit()


In [34]:
!pip install pyspark

from pyspark.sql import SparkSession
import pandas as pd

# Initialize Spark
spark = SparkSession.builder.appName("SqoopSim").getOrCreate()

# Read from SQLite
df_sqlite = pd.read_sql_query("SELECT * FROM employee", conn)
df_spark = spark.createDataFrame(df_sqlite)

# Show DataFrame (simulates HDFS storage)
df_spark.show()


+---+-------+-------+------+
| id|   name|   dept|salary|
+---+-------+-------+------+
|  1|  Alice|     HR| 35000|
|  2|    Bob|     IT| 40000|
|  3|Charlie|Finance| 38000|
+---+-------+-------+------+



In [35]:
# Expose the Spark DataFrame under a SQL-visible name; the temporary view
# stands in for a Hive table in this simulation.
df_spark.createOrReplaceTempView("employee_hive")

# Read the view back through Spark SQL and print its rows.
hive_rows = spark.sql("SELECT * FROM employee_hive")
hive_rows.show()


+---+-------+-------+------+
| id|   name|   dept|salary|
+---+-------+-------+------+
|  1|  Alice|     HR| 35000|
|  2|    Bob|     IT| 40000|
|  3|Charlie|Finance| 38000|
+---+-------+-------+------+



In [36]:
# Export step (simulates `sqoop export`): pull the Hive-like view out of
# Spark, convert it to pandas, and write it to a SQLite backup table,
# replacing any earlier copy of that table.
df_export = spark.sql("SELECT * FROM employee_hive")
export_frame = df_export.toPandas()
export_frame.to_sql("employee_backup", conn, if_exists='replace', index=False)

# Verify: read the backup table back; as the last expression of the cell the
# resulting frame is displayed.
pd.read_sql_query("SELECT * FROM employee_backup", conn)


Unnamed: 0,id,name,dept,salary
0,1,Alice,HR,35000
1,2,Bob,IT,40000
2,3,Charlie,Finance,38000
