# Initial definitions

In [59]:
%env HADOOP_VERSION     2.9.2
%env HADOOP_PATH hadoop-2.9.2

env: HADOOP_VERSION=2.9.2
env: HADOOP_PATH=hadoop-2.9.2


# Preparing the environment

## Downloading Hadoop

In [60]:
!wget http://ftp.unicamp.br/pub/apache/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz -q --show-progress

/bin/sh: 1: wget: not found


## Extracting compressed files and removing .tar

In [61]:
# !rm ${HADOOP_PATH} -r
!tar -xvf hadoop-${HADOOP_VERSION}.tar.gz >/dev/null 
!rm       hadoop-${HADOOP_VERSION}.tar.gz

/bin/sh: 1: tar: not found
/bin/sh: 1: rm: not found


## Discovering the Java path

In [62]:
!dirname $(dirname $(readlink -f $(which javac)))

/bin/sh: 1: which: not found
/bin/sh: 1: readlink: not found
/bin/sh: 1: dirname: not found
/bin/sh: 1: dirname: not found


## Setting the Java path envvar

We also added it to user's .bashrc so it will be loaded as the nodes perform ssh connections.

In [63]:
%env JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64

env: JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64


In [64]:
!echo "export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 " >> ~/.bashrc
!echo "export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 " >> ~/.profile
!echo "export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 " >> ${HADOOP_PATH}/etc/hadoop/hadoop-env.sh

# Hadoop in Standalone Mode (local)

## MapReduce in the local filesystem - word count example

In [65]:
!${HADOOP_PATH}/bin/hadoop jar ${HADOOP_PATH}/share/hadoop/mapreduce/hadoop-mapreduce-examples-${HADOOP_VERSION}.jar wordcount \
                               ./resources/examples/newyorknewyork.txt ./output

/usr/bin/env: ‘bash’: No such file or directory


### Listing files in the output folder

In [66]:
!ls ./output/

/bin/sh: 1: ls: not found


### Reading output file

In [67]:
! cat ./output/part-r-00000

/bin/sh: 1: cat: not found


# Hadoop in Pseudo-Distributed Mode

## Preparing the environment

### Starting sshd server

Check `/binder/postBuild` and `/resources/configs/ssh/sshd_config` files for more details

In [68]:
!/usr/sbin/sshd -f resources/configs/ssh/sshd_config 

### Adding names to know hosts 

Commands below stablish ssh connections to used host names/ips. This step avoids yes/no host confirmation.

In [69]:
!ssh -o "StrictHostKeyChecking no" $USER@localhost -p 8822 -C "exit" 
!ssh -o "StrictHostKeyChecking no" $USER@0.0.0.0   -p 8822 -C "exit"

/bin/sh: 1: ssh: not found
/bin/sh: 1: ssh: not found


### Adding ssh options to Hadoop via envvar

* connecting in a diferent port (`-p 8822`)
* avoiding host key checking (`-o StrictHostKeyChecking=no`)

In [70]:
%env HADOOP_SSH_OPTS= -o StrictHostKeyChecking=no -p 8822

env: HADOOP_SSH_OPTS=-o StrictHostKeyChecking=no -p 8822


In [71]:
%env PDSH_RCMD_TYPE ssh

env: PDSH_RCMD_TYPE=ssh


### Copying configurations files to Hadoop folder

Check the configuration files accordingly to the Hadoop version. 
Refer to the `/resources/configs/hadoop/<version>`.

In [72]:
!cp resources/configs/hadoop/${HADOOP_VERSION}/core-site.xml   ${HADOOP_PATH}/etc/hadoop/
!cp resources/configs/hadoop/${HADOOP_VERSION}/hdfs-site.xml   ${HADOOP_PATH}/etc/hadoop/

/bin/sh: 1: cp: not found
/bin/sh: 1: cp: not found


## Formatting the filesystem

In [73]:
!${HADOOP_PATH}/bin/hdfs namenode -format -force -nonInteractive

/usr/bin/env: ‘bash’: No such file or directory


## Starting DFS (NameNode, SecondaryNameNode, and DataNode daemons)

In [74]:
!${HADOOP_PATH}/sbin/start-dfs.sh
!jps

/usr/bin/env: ‘bash’: No such file or directory
/bin/sh: 1: jps: not found


## MapReduce - Word count example 

### Creating folders in the distributed file system

In [75]:
!${HADOOP_PATH}/bin/hdfs dfs -mkdir /user/
!${HADOOP_PATH}/bin/hdfs dfs -mkdir /user/matheus/
!${HADOOP_PATH}/bin/hdfs dfs -mkdir /user/matheus/input/

/usr/bin/env: ‘bash’: No such file or directory
/usr/bin/env: ‘bash’: No such file or directory
/usr/bin/env: ‘bash’: No such file or directory


### Copying a file to a folder in the distributed file system

In [76]:
!${HADOOP_PATH}/bin/hdfs dfs -put ./resources/examples/newyorknewyork.txt /user/matheus/input/

/usr/bin/env: ‘bash’: No such file or directory


### Listing files in a folder of the distributed file system

In [77]:
!${HADOOP_PATH}/bin/hdfs dfs -ls /user/matheus/input/

/usr/bin/env: ‘bash’: No such file or directory


### Retrieving the contents of a file in the distributed file system

In [78]:
!${HADOOP_PATH}/bin/hdfs dfs -cat /user/matheus/input/newyorknewyork.txt

/usr/bin/env: ‘bash’: No such file or directory


### Running MapReduce job in Pseudo-Distributed Mode

In [79]:
!./${HADOOP_PATH}/bin/hadoop jar  ./${HADOOP_PATH}/share/hadoop/mapreduce/hadoop-mapreduce-examples-${HADOOP_VERSION}.jar wordcount \
                                /user/matheus/input /user/matheus/output

/usr/bin/env: ‘bash’: No such file or directory


### Listing files in the output folder

In [80]:
!./${HADOOP_PATH}/bin/hdfs dfs -ls /user/matheus/output/

/usr/bin/env: ‘bash’: No such file or directory


### Reading output file

In [81]:
!./${HADOOP_PATH}/bin/hdfs dfs -cat /user/matheus/output/part-r-00000

/usr/bin/env: ‘bash’: No such file or directory


# Starting YARN in Pseudo-Distributed Mode

## Preparing the environment

### Copying configurations files to Hadoop folder

In [82]:
!cp resources/configs/hadoop/${HADOOP_VERSION}/mapred-site.xml ${HADOOP_PATH}/etc/hadoop/
!cp resources/configs/hadoop/${HADOOP_VERSION}/yarn-site.xml   ${HADOOP_PATH}/etc/hadoop/

/bin/sh: 1: cp: not found
/bin/sh: 1: cp: not found


## Starting YARN

In [83]:
!${HADOOP_PATH}/sbin/start-yarn.sh
!jps

/usr/bin/env: ‘bash’: No such file or directory
/bin/sh: 1: jps: not found


## MapReduce via YARN - Word count example 

In [84]:
!./${HADOOP_PATH}/bin/yarn jar  ./${HADOOP_PATH}/share/hadoop/mapreduce/hadoop-mapreduce-examples-${HADOOP_VERSION}.jar wordcount \
                                /user/matheus/input /user/matheus/output2

/usr/bin/env: ‘bash’: No such file or directory


### Listing files in the output folder

In [85]:
!./${HADOOP_PATH}/bin/hdfs dfs -ls /user/matheus/output2/

/usr/bin/env: ‘bash’: No such file or directory


### Reading output file

In [86]:
!./${HADOOP_PATH}/bin/hdfs dfs -cat /user/matheus/output2/part-r-00000

/usr/bin/env: ‘bash’: No such file or directory


# Hive Initial definitions

In [None]:
%env HIVE_VERSION     hive-2.3.5
%env HIVE_PATH apache-hive-2.3.5-bin

# Preparing the Enviroment

## Downloading Hive

In [None]:
!wget http://ftp.unicamp.br/pub/apache/hive/${HIVE_VERSION}/${HIVE_PATH}.tar.gz -q --show-progress

## Extracting compressed files and removing .tar

In [None]:
!tar -xvf ${HIVE_PATH}.tar.gz >/dev/null 
!rm       ${HIVE_PATH}.tar.gz

In [None]:
!ls

## Setting the Hive path envvar

In [None]:
%env JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64

%env HIVE_HOME=/home/jovyan/${HIVE_PATH}
%env PATH=$PATH:$JAVA_HOME/bin:$HADOOP_PATH/bin:$HIVE_HOME/bin:$HIVE_HOME/conf
!export JAVA_HOME CLASS_PATH PATH HIVE_HOME

!echo "export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 " >> ~/.bashrc
!echo "export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 " >> ~/.profile
!echo "export HIVE_HOME=/home/jovyan/$HIVE_PATH " >> ~/.bashrc
!echo "export HIVE_HOME=/home/jovyan/$HIVE_PATH " >> ~/.profile
!echo "export PATH=$PATH:$JAVA_HOME/bin:$HADOOP_PATH/bin:$HIVE_HOME/bin:$HIVE_HOME/conf " >> ~/.bashrc
!echo "export PATH=$PATH:$JAVA_HOME/bin:$HADOOP_PATH/bin:$HIVE_HOME/bin:$HIVE_HOME/conf " >> ~/.profile
!source /etc/profile

#!echo "export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 " >> ${HADOOP_PATH}/etc/hadoop/hadoop-env.sh

## Configure hive-env.sh

In [None]:
!echo "export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64" > /home/jovyan/${HIVE_PATH}/bin/hive-env.sh
!echo "export HADOOP_HOME=$HADOOP_PATH" >> /home/jovyan/${HIVE_PATH}/bin/hive-env.sh
!echo "export HIVE_HOME=/home/jovyan/$HIVE_PATH" >> /home/jovyan/${HIVE_PATH}/bin/hive-env.sh

# Hive Configuration Directory can be controlled by:
!echo "export HIVE_CONF_DIR=$HIVE_HOME/conf" >> /home/jovyan/${HIVE_PATH}/bin/hive-env.sh

# Folder containing extra libraries required for hive compilation/execution can be controlled by:
!echo "export HIVE_AUX_JARS_PATH=$HIVE_HOME/lib/*" >> /home/jovyan/${HIVE_PATH}/bin/hive-env.sh

## Create HDFS directories

In [None]:
#!./hadoop-2.9.2/bin/hdfs dfs -mkdir /user
#!./${HADOOP_PATH}/bin/hdfs dfs -mkdir /user/hive
#!./${HADOOP_PATH}/bin/hdfs dfs -mkdir /user/hive/warehouse
#!./${HADOOP_PATH}/bin/hdfs dfs -mkdir /tmp/hive
#!./${HADOOP_PATH}/bin/hdfs dfs -chmod  777 /user/hive/warehouse
#!./${HADOOP_PATH}/bin/hdfs dfs -chmod  777 /tmp/hive
!./${HADOOP_PATH}/bin/hdfs dfs -ls /user

## Create database to store the metadata

In [None]:
!.$HIVE_HOME/bin/schematool –initschema –dbtype derby

In [None]:
!hive