In [1]:
%load_ext dockermagic

# Hive
![Hive](https://hive.apache.org/images/hive_logo_medium.jpg)

- https://hive.apache.org/

## Setup

- version 3.1.3

In [2]:
%%dockerexec hadoop

# Download package
mkdir -p /opt/pkgs
cd /opt/pkgs
wget -q -c https://downloads.apache.org/hive/hive-3.1.3/apache-hive-3.1.3-bin.tar.gz

# unpack file and create link
tar -zxf apache-hive-3.1.3-bin.tar.gz -C /opt
ln -s /opt/apache-hive-3.1.3-bin /opt/hive

# update envvars.sh
cat >> /opt/envvars.sh << EOF
# Hive
export HIVE_HOME=/opt/hive
export PATH=\${PATH}:\${HIVE_HOME}/bin

EOF

# Fix slf4j
rm /opt/hive/lib/log4j-slf4j-impl-2.17.1.jar

cat /opt/envvars.sh

export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk-amd64
export PDSH_RCMD_TYPE=ssh
export HADOOP_HOME=/opt/hadoop
export HADOOP_VERSION=3.3.6
export HADOOP_COMMON_HOME=${HADOOP_HOME}
export HADOOP_CONF_DIR=${HADOOP_HOME}/etc/hadoop
export HADOOP_HDFS_HOME=${HADOOP_HOME}
export HADOOP_MAPRED_HOME=${HADOOP_HOME}
export HADOOP_YARN_HOME=${HADOOP_HOME}
export PATH=${PATH}:${HADOOP_HOME}/bin:${HADOOP_HOME}/sbin
# Hive
export HIVE_HOME=/opt/hive
export PATH=${PATH}:${HIVE_HOME}/bin



## Hadoop configuration (for beeline)

- core-site.xml

```xml
<configuration>
...
<property>
  <name>hadoop.proxyuser.hadoop.groups</name>
  <value>*</value>
</property>
<property>
  <name>hadoop.proxyuser.hadoop.hosts</name>
  <value>*</value>
</property>
</configuration>
```

## Hive Metastore

- using local Derby database

### Create directory in HDFS

In [3]:
%%dockerexec hadoop

source /opt/envvars.sh

hdfs dfs -mkdir -p /user/hive/warehouse
hdfs dfs -chmod g+w /user/hive/warehouse

### Initialize database

In [4]:
%%dockerexec hadoop

source /opt/envvars.sh

mkdir -p $HIVE_HOME/hiveserver2
cd $HIVE_HOME/hiveserver2
$HIVE_HOME/bin/schematool -dbType derby -initSchema 2> /dev/null

Metastore connection URL:	 jdbc:derby:;databaseName=metastore_db;create=true
Metastore Connection Driver :	 org.apache.derby.jdbc.EmbeddedDriver
Metastore connection User:	 APP
Starting metastore schema initialization to 3.1.0
Initialization script hive-schema-3.1.0.derby.sql
Initialization script completed
schemaTool completed


### Start hiveserver2

In [5]:
%%dockerexec hadoop

source /opt/envvars.sh

cd /opt/hive/hiveserver2
nohup /opt/hive/bin/hive --service hiveserver2 \
--hiveconf hive.security.authorization.createtable.owner.grants=ALL \
--hiveconf hive.root.logger=INFO,console > hiveserver2.out 2>&1 &
echo $! > hiveserver2.pid

## Example

- SF Bay Area Bike Share (https://www.kaggle.com/benhamner/sf-bay-area-bike-share)
- stations.csv and trips.csv

In [8]:
%%dockerexec hadoop

source /opt/envvars.sh

mkdir -p /opt/datasets_hive

In [9]:
%%bash

# copy datasets used by hive examples to hadoop container
docker cp hivedataset.tgz hadoop:/opt/datasets_hive

In [10]:
%%dockerexec hadoop

source /opt/envvars.sh

cd /opt/datasets_hive
tar -zxf hivedataset.tgz
rm hivedataset.tgz
ls

hdfs dfs -mkdir -p bikeshare/stations
hdfs dfs -put stations.csv bikeshare/stations
hdfs dfs -mkdir -p bikeshare/trips
hdfs dfs -put trips.csv bikeshare/trips

stations.csv
trips.csv


## Using beeline

In [11]:
%%dockerwrite hadoop /opt/script.sql

-- configure jobs executor
SET hive.execution.engine=mr;
SET mapreduce.framework.name=yarn;

-- create bikeshare database
CREATE DATABASE bikeshare;
SHOW DATABASES;

[sPreparing to copy...[?25l[u[2KCopying to container - 0B[?25h[u[2KSuccessfully copied 2.05kB to hadoop:/opt/script.sql


In [12]:
%%dockerexec hadoop

source /opt/envvars.sh

beeline -n hadoop -u jdbc:hive2://localhost:10000 --silent=true -f /opt/script.sql

SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/opt/apache-hive-3.1.3-bin/lib/log4j-slf4j-impl-2.17.1.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/opt/hadoop-3.3.6/share/hadoop/common/lib/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]







+----------------+
| database_name  |
+----------------+
| bikeshare      |
| default        |
+----------------+


In [14]:
%%dockerwrite hadoop /opt/script.sql

USE bikeshare;

-- create stations table
CREATE EXTERNAL TABLE stations (
    station_id INT,
    name STRING,
    lat DOUBLE,
    long DOUBLE,
    dockcount INT,
    landmark STRING,
    installation STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION 'hdfs:///user/hadoop/bikeshare/stations';

-- create trips table
CREATE EXTERNAL TABLE trips (
    trip_id INT,
    duration INT,
    start_date STRING,
    start_station STRING,
    start_terminal INT,
    end_date STRING,
    end_station STRING,
    end_terminal INT,
    bike_num INT,
    subscription_type STRING,
    zip_code STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION 'hdfs:///user/hadoop/bikeshare/trips';

-- show tables
SHOW TABLES;

[sPreparing to copy...[?25l[u[2KCopying to container - 0B[?25h[u[2KSuccessfully copied 2.56kB to hadoop:/opt/script.sql


In [15]:
%%dockerexec hadoop

source /opt/envvars.sh

beeline -n hadoop -u jdbc:hive2://localhost:10000 --silent=true -f /opt/script.sql







































+-----------+
| tab_name  |
+-----------+
| stations  |
| trips     |
+-----------+


In [16]:
%%dockerwrite hadoop /opt/script.sql

USE bikeshare;

DESCRIBE stations;
DESCRIBE trips;
DESCRIBE FORMATTED stations;
DESCRIBE FORMATTED trips;

[sPreparing to copy...[?25l[u[2KCopying to container - 0B[?25h[u[2KSuccessfully copied 2.05kB to hadoop:/opt/script.sql


In [17]:
%%dockerexec hadoop

source /opt/envvars.sh

beeline -n hadoop -u jdbc:hive2://localhost:10000 --silent=true -f /opt/script.sql




+---------------+------------+----------+
|   col_name    | data_type  | comment  |
+---------------+------------+----------+
| station_id    | int        |          |
| name          | string     |          |
| lat           | double     |          |
| long          | double     |          |
| dockcount     | int        |          |
| landmark      | string     |          |
| installation  | string     |          |
+---------------+------------+----------+

+--------------------+------------+----------+
|      col_name      | data_type  | comment  |
+--------------------+------------+----------+
| trip_id            | int        |          |
| duration           | int        |          |
| start_date         | string     |          |
| start_station      | string     |          |
| start_terminal     | int        |          |
| end_date           | string     |          |
| end_station        | string     |          |
| end_terminal       | int        |          |
| bike_num       

In [18]:
%%dockerwrite hadoop /opt/script.sql

USE bikeshare;

-- query - number of trips per terminal
SELECT start_terminal, start_station, COUNT(1) AS count
FROM trips
GROUP BY start_terminal, start_station
ORDER BY count
DESC LIMIT 10;

[sPreparing to copy...[?25l[u[2KCopying to container - 0B[?25h[u[2KSuccessfully copied 2.05kB to hadoop:/opt/script.sql


In [19]:
%%dockerexec hadoop

source /opt/envvars.sh

beeline -n hadoop -u jdbc:hive2://localhost:10000 --silent=true -f /opt/script.sql









+-----------------+------------------------------------------------+--------+
| start_terminal  |                 start_station                  | count  |
+-----------------+------------------------------------------------+--------+
| 70              | San Francisco Caltrain (Townsend at 4th)       | 26304  |
| 69              | San Francisco Caltrain 2 (330 Townsend)        | 21758  |
| 50              | Harry Bridges Plaza (Ferry Building)           | 17255  |
| 55              | Temporary Transbay Terminal (Howard at Beale)  | 14436  |
| 60              | Embarcadero at Sansome                         | 14158  |
| 61              | 2nd at Townsend                                | 14026  |
| 65              | Townsend at 7th                                | 13752  |
| 74              | Steuart at Market                              | 13687  |
| 67              | Market at 10th                                 | 11885  |
| 77              | Market at Sansome                   

In [20]:
%%dockerwrite hadoop /opt/script.sql

USE bikeshare;

-- query - join between stations and trips
SELECT t.trip_id, t.duration, t.start_date, s.name, s.lat, s.long, s.landmark
FROM stations s
JOIN trips t ON s.station_id = t.start_terminal
LIMIT 10;

[sPreparing to copy...[?25l[u[2KCopying to container - 0B[?25h[u[2KSuccessfully copied 2.05kB to hadoop:/opt/script.sql


In [21]:
%%dockerexec hadoop

source /opt/envvars.sh

beeline -n hadoop -u jdbc:hive2://localhost:10000 --silent=true -f /opt/script.sql








+------------+-------------+------------------+------------------------------------------------+------------+--------------+----------------+
| t.trip_id  | t.duration  |   t.start_date   |                     s.name                     |   s.lat    |    s.long    |   s.landmark   |
+------------+-------------+------------------+------------------------------------------------+------------+--------------+----------------+
| 913460     | 765         | 8/31/2015 23:26  | Harry Bridges Plaza (Ferry Building)           | 37.795392  | -122.394203  | San Francisco  |
| 913459     | 1036        | 8/31/2015 23:11  | San Antonio Shopping Center                    | 37.400443  | -122.108338  | Mountain View  |
| 913455     | 307         | 8/31/2015 23:13  | Post at Kearney                                | 37.788975  | -122.403452  | San Francisco  |
| 913454     | 409         | 8/31/2015 23:10  | San Jose City Hall                             | 37.337391  | -121.886995  | San Jose       |

## WordCount using Hive

In [6]:
%%dockerexec hadoop

source /opt/envvars.sh

mkdir -p /opt/datasets_hive
cd /opt/datasets_hive

wget -q -c https://tinyurl.com/y68jxy7f -O stop-word-list.csv
hdfs dfs -mkdir -p stopwords
hdfs dfs -put stop-word-list.csv stopwords
hdfs dfs -cat stopwords/stop-word-list.csv

# download book "The Complete Works of William Shakespeare, by William Shakespeare" from Gutenberg Project
wget -q -c http://www.gutenberg.org/files/100/100-0.txt -O shakespeare.txt

# create directory in HDFS and put file
hdfs dfs -mkdir -p shakespeare
hdfs dfs -put shakespeare.txt shakespeare
hdfs dfs -ls -h shakespeare

bash: line 3: cd: /opt/datasets_hive: No such file or directory
a, able, about, across, after, all, almost, also, am, among, an, and, any, are, as, at, be, because, been, but, by, can, cannot, could, dear, did, do, does, either, else, ever, every, for, from, get, got, had, has, have, he, her, hers, him, his, how, however, i, if, in, into, is, it, its, just, least, let, like, likely, may, me, might, most, must, my, neither, no, nor, not, of, off, often, on, only, or, other, our, own, rather, said, say, says, she, should, since, so, some, than, that, the, their, them, then, there, these, they, this, tis, to, too, twas, us, wants, was, we, were, what, when, where, which, while, who, whom, why, will, with, would, yet, you, yourFound 1 items
-rw-r--r--   2 hadoop hadoop      5.4 M 2023-12-18 22:05 shakespeare/shakespeare.txt


In [7]:
%%dockerwrite hadoop /opt/wordcount.sql

CREATE TABLE shakespeare_text (line STRING);
LOAD DATA INPATH '/user/hadoop/shakespeare/shakespeare.txt' INTO TABLE shakespeare_text;

CREATE TABLE stopwords (word STRING);
CREATE TABLE tempwords (line STRING);
LOAD DATA INPATH '/user/hadoop/stopwords/stop-word-list.csv' INTO TABLE tempwords;

-- split comma-separated stopwords to rows
INSERT INTO stopwords
SELECT word
FROM tempwords
LATERAL VIEW explode(split(line, ',')) t AS word;
DROP TABLE tempwords;

[sPreparing to copy...[?25l[u[2KCopying to container - 0B[?25h[u[2KSuccessfully copied 2.05kB to hadoop:/opt/wordcount.sql


In [8]:
%%dockerexec hadoop

source /opt/envvars.sh

beeline -n hadoop -u jdbc:hive2://localhost:10000 --silent=true -f /opt/wordcount.sql


 



 











+----------------------------------------------------+
|               shakespeare_text.line                |
+----------------------------------------------------+
| The Project Gutenberg eBook of The Complete Works of William Shakespeare, by William Shakespeare |
|                                                    |
| This eBook is for the use of anyone anywhere in the United States and |
| most other parts of the world at no cost and with almost no restrictions |
| whatsoever. You may copy it, give it away or re-use it under the terms |
| of the Project Gutenberg License included with this eBook or online at |
| www.gutenberg.org. If you are not located in the United States, you |
| will have to check the laws of the country where you are located before |
| using this eBook.                                  |
|                                                    |
+----------------------------------------------------+




+-----------------+
| stopwords.word  |
+-

In [11]:
%%dockerwrite hadoop /opt/wordcount.sql

SELECT w.word, count(1) AS count
FROM (
    SELECT explode(split(regexp_replace(lower(line), '[^a-z\\s]', ''), '\\s+')) AS word
    FROM shakespeare_text
) w
LEFT OUTER JOIN (
    SELECT lower(trim(word)) AS word
    FROM stopwords
) s ON w.word = s.word
WHERE s.word IS NULL AND w.word != ''
GROUP BY w.word
ORDER BY count DESC
LIMIT 30;

[sPreparing to copy...[?25l[u[2KCopying to container - 0B[?25h[u[2KSuccessfully copied 2.05kB to hadoop:/opt/wordcount.sql


In [12]:
%%dockerexec hadoop

source /opt/envvars.sh

beeline -n hadoop -u jdbc:hive2://localhost:10000 --silent=true -f /opt/wordcount.sql



 










+---------+--------+
| w.word  | count  |
+---------+--------+
| thou    | 5856   |
| thy     | 4353   |
| shall   | 3851   |
| thee    | 3416   |
| lord    | 3124   |
| king    | 3019   |
| now     | 3014   |
| sir     | 2976   |
| good    | 2962   |
| o       | 2769   |
| well    | 2627   |
| come    | 2625   |
| more    | 2515   |
| enter   | 2408   |
| love    | 2291   |
| here    | 2263   |
| ill     | 2146   |
| hath    | 2062   |
| one     | 1959   |
| man     | 1911   |
| upon    | 1866   |
| go      | 1796   |
| make    | 1789   |
| know    | 1759   |
| scene   | 1631   |
| see     | 1552   |
| such    | 1532   |
| out     | 1472   |
| give    | 1423   |
| first   | 1383   |
+---------+--------+


In [None]:
%%dockerexec hadoop

cd /opt/hive/hiveserver2

# kill hiveserver2
kill $(cat hiveserver2.pid)
rm hiveserver2.pid