初始化hbase中接收用户数据的相关库，相应的构造hive的外接表，这样我们可以通过spark-streaming绑定到hive上进行数据插入
需要保证hbase和hive的thrift服务是启动的：
${HBASE_HOME}/bin/hbase-daemon.sh start thrift -threadpool
${HIVE_HOME}/bin/hive --service hiveserver2

In [1]:
class Conf:
    pass
conf = Conf()
conf.hbase_db = "user_list_click"
conf.hive_db = "resys_one"
conf.hive_table = "user_list_click"
conf.__dict__

{'hbase_db': 'user_list_click',
 'hive_db': 'resys_one',
 'hive_table': 'user_list_click'}

In [8]:
# 首先构建这个hbase数据库和对应的hive外接表
import happybase
conn = happybase.Connection('localhost', autoconnect=True)
conn.open()
table_list = [ t.decode() for t in conn.tables()]
table_list

['hive_test_hbase',
 'spark',
 'test',
 'user_list_click',
 'videobase_all',
 'zzti']

In [9]:
if conf.hbase_db not in table_list:
    conn.create_table(conf.hbase_db, {'d': dict(max_versions=1, block_cache_enabled=False)})
table = conn.table(conf.hbase_db)
table.families()

{b'd': {'name': b'd:',
  'max_versions': 1,
  'compression': b'NONE',
  'in_memory': False,
  'bloom_filter_type': b'NONE',
  'bloom_filter_vector_size': 0,
  'bloom_filter_nb_hashes': 0,
  'block_cache_enabled': False,
  'time_to_live': 2147483647}}

In [10]:
# 创建这个hbase库对应的hive表
from pyhive import hive
cursor = hive.connect("localhost").cursor()
cursor

<pyhive.hive.Cursor at 0x7f46408cb510>

user_list_click向kafka传递的数据格式为：
{
    "click":"470030960", # 保存用户点击的文章
    "clickTime":1639130181018, # 点击行为发生的时间戳
    "id":"2021121056279100961-470030960", # 这一次点击行为的id号，该id号是独立，且符合字典序排序的
    "list":["410057990","410240504","410449868","410500249","410553196","410925946","411194717","470030960","470162792","470216246","470300594","470595946","470899479","470917063","471752404"], # 用户点击这个文章的时候，呈现的文章是什么
    "recid":"2021121056279100961", # 每一次推荐，都会有一个recid，用户可能多次点击，但recid是不变的
    "userid":"0000000001" # 用户的id号
}


In [12]:
# 如果需要的话，先删了之前的表
cursor.execute("drop table if exists {0}.{1}".format(conf.hive_db, conf.hive_table))

In [13]:
# 创建外链表，为了便于后续操作，hive中保存的都是字符串，至于字段含义，可以后续进行处理
cursor.execute("create database if not exists " + conf.hive_db )
external_sql="""
 create external table if not exists {0}.{1} (
    id string,
    click string,
    clickTime string,
    list string,
    recid string,
    userid string)
 STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
 WITH SERDEPROPERTIES ("hbase.columns.mapping" = "d:click,
                                                  d:clickTime,
                                                  d:list,
                                                  d:recid,
                                                  d:userid")
 TBLPROPERTIES("hbase.table.name" = "{2}")
""".format(conf.hive_db, conf.hive_table, conf.hbase_db)
print(external_sql)
cursor.execute(external_sql)


 create external table if not exists resys_one.user_list_click (
    id string,
    click string,
    clickTime string,
    list string,
    recid string,
    userid string)
 STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
 WITH SERDEPROPERTIES ("hbase.columns.mapping" = "d:click,
                                                  d:clickTime,
                                                  d:list,
                                                  d:recid,
                                                  d:userid")
 TBLPROPERTIES("hbase.table.name" = "user_list_click")



这里没有设计模拟数据的环节，但也可以进行数据模拟

In [15]:
conn.close()
cursor.close()