# Dataset for ingestion into InfluxDB

In [7]:
from numpy import mean
import pandas as pd
import json
from string import punctuation
import datetime

In [8]:
# Load the raw review data.
# JSON text is UTF-8; pass the encoding explicitly so the non-ASCII characters
# in the review text are decoded the same way on every platform instead of
# depending on the locale's default encoding.
file_name = '../data/rawdata_20170620.json'
with open(file_name, encoding='utf-8') as json_data:
    data = json.load(json_data)

In [9]:
# Build a DataFrame from the object loaded from the JSON file.
df = pd.DataFrame(data)
# NOTE: the trailing comma turns this expression into a 1-tuple, which is why
# the shape is displayed wrapped in an extra pair of parentheses.
df.shape, 

((2239, 8),)

## Prepare data

In [10]:
# Remove exact duplicate rows.
# drop_duplicates() keeps the surviving rows' original labels, which leaves
# gaps in the index; renumber it so labels are contiguous (0..n-1) and
# label-based lookups such as df.col[i] agree with positional order.
df = df.drop_duplicates().reset_index(drop=True)
df.shape

(2219, 8)

In [11]:
# Preview the first two rows of the de-duplicated frame.
df.head(2)

Unnamed: 0,answer,date,stars,text,title,user,user_id,verify
0,,2017-06-20 20:22:59,3,Missing a few issues. Export phone listMore he...,Missing a few issues,Otto,594984220000ff000aa60f48,True
1,"Hello JL,I have already located your complaint...",2017-06-20 02:38:57,1,"Purchased 100 minutes as advertised, but after...","Its a scam, please avoid",JL,55fc31370000ff0001dcc729,False


In [12]:
# Add a `timestamp` column: each 'date' string parsed and converted to
# seconds since the Unix epoch (float).
# The parsed datetimes are naive, so .timestamp() interprets them in the
# local time zone of the machine running the notebook.
df = df.assign(
    timestamp=[
        datetime.datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S').timestamp()
        for date_str in df['date']
    ]
)

In [30]:
# 'answered' if the review has an answer, 'unanswered' otherwise
def been_answered(x):
    """Label a review's answer field as 'answered' or 'unanswered'.

    Parameters
    ----------
    x : object
        Value of the 'answer' column for one review.

    Returns
    -------
    str
        'unanswered' when ``x`` is missing (``None``, a float NaN, or an
        empty / whitespace-only string); 'answered' for anything else.
    """
    if x is None:
        return 'unanswered'
    # NaN is the only float that is not equal to itself; pandas uses it as
    # the missing-value marker, so treat it like None.
    if isinstance(x, float) and x != x:
        return 'unanswered'
    # An answer with no visible text is not a real answer.
    if isinstance(x, str) and not x.strip():
        return 'unanswered'
    return 'answered'

# Add the `answered` label column, one label per value of 'answer'.
answered_labels = list(map(been_answered, df['answer']))
df = df.assign(answered=answered_labels)

In [31]:
# 'verified' if the reviewer was invited by Rebtel, 'unverified' otherwise
def been_verified(x):
    """Map a truthy/falsy 'verify' value to a text label.

    Returns 'verified' when ``x`` is truthy and 'unverified' when it is
    falsy (``False``, ``None``, ``0``, empty string, ...).
    """
    return 'verified' if x else 'unverified'

# Add the `verified` label column, one label per value of 'verify'.
verified_labels = list(map(been_verified, df['verify']))
df = df.assign(verified=verified_labels)

In [29]:
# Preview the first three rows, including the derived columns.
df.head(3)

Unnamed: 0,answer,date,stars,text,title,user,user_id,verify,timestamp,answered,verified
0,,2017-06-20 20:22:59,3,Missing a few issues. Export phone listMore he...,Missing a few issues,Otto,594984220000ff000aa60f48,True,1497983000.0,unanswered,verified
1,"Hello JL,I have already located your complaint...",2017-06-20 02:38:57,1,"Purchased 100 minutes as advertised, but after...","Its a scam, please avoid",JL,55fc31370000ff0001dcc729,False,1497919000.0,answered,unverified
2,"Hi Gaurav,We’re sorry to hear you got discoura...",2017-06-18 08:18:32,1,I was about to subscribe for their service.. B...,I was about to subscribe for their …,Gaurav Singh,5946374e0000ff000aa505da,False,1497767000.0,answered,unverified


In [32]:
# Header lines of the InfluxDB ingestion file.
# Settings for the target database and its retention policy:
mydb = 'reviews'            # database name
myretention = 'cienday'     # retention policy name
myduration = str(100)       # retention duration; written with a 'd' suffix below
myreplication = str(1)      # replication factor

# NOTE(review): the CREATE statements are emitted with a leading '# ', like the
# section markers around them. Confirm whether the importer is expected to
# execute them or whether they are intentionally left as comments.
line0 = '# DDL\n'
line1 = '# CREATE DATABASE {}\n'.format(mydb)
line2 = (
    '# CREATE RETENTION POLICY {policy} ON {db} '
    'DURATION {days}d REPLICATION {copies} DEFAULT\n\n'
).format(policy=myretention, db=mydb, days=myduration, copies=myreplication)
line3 = '# DML\n'
line4 = '# CONTEXT-DATABASE: {}\n'.format(mydb)
line5 = '# CONTEXT-RETENTION-POLICY: {}\n\n'.format(myretention)


## Save data

In [33]:
# Write the ingestion file: the preamble lines followed by one point per review.
file_name = '../data/reviews_influxDB2.txt'
with open(file_name, 'w') as f:
    f.writelines([line0, line1, line2, line3, line4, line5])

    # Walk the columns positionally instead of looking rows up by integer
    # label (df.col[i]). Label lookups raise KeyError or skip rows whenever
    # the index is not exactly 0..n-1, e.g. after rows have been dropped.
    rows = zip(df['answered'], df['verified'], df['stars'], df['timestamp'])
    for answered, verified, stars, timestamp in rows:
        # Tags: answered, verified, stars. Fields: rating (= stars), value=1.
        # The timestamp is truncated to whole seconds.
        # NOTE(review): confirm the import is run with second precision.
        f.write(
            'review,answered=%s,verified=%s,stars=%s rating=%s,value=1 %s\n'
            % (answered, verified, stars, stars, int(timestamp))
        )
            