In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import statistics
import warnings
import seaborn as sns
import random as rd
import sys
import json
import re


sys.path.append('../scripts')
from pickle_file_handlers import HandlePickle
from cleaner import CleanDataFrame
from plotter import Plotter
from logger import logger
from db_api import Database_api

In [None]:
# Create cleaner module and plotter module classes
cleaner = CleanDataFrame()
db_api = Database_api()
plotter = Plotter()
pickle_handler = HandlePickle()

In [None]:
# Fetch the 'warehouse' table through the project's database helper.
# Presumably this returns a pandas DataFrame (it is used as one in the cells below) — verify in db_api.
warehouse = db_api.get_db_data_from_table_name('warehouse')

In [None]:
# Print the column names, dtypes and non-null counts of the raw table.
warehouse.info()

In [None]:
# Keep only the rows whose row_num equals 1.
# NOTE(review): row_num presumably marks the first record of each duplicate group — confirm against the query that builds the table.
# .copy() makes clean_warehouse an independent frame: the following cells assign into
# it with .loc[...] = ... and inplace=True, which on a bare boolean-mask selection
# triggers SettingWithCopyWarning and makes it ambiguous whether `warehouse` is modified.
clean_warehouse = warehouse[warehouse["row_num"] == 1].copy()
clean_warehouse

In [None]:
# List the column labels available after filtering.
clean_warehouse.columns

In [None]:
# Run the project cleaner's datatype fixer on the filtered frame.
# NOTE(review): store_df is not used by any later executable statement visible in this
# notebook section, and whether fix_datatypes mutates clean_warehouse in place cannot be
# determined from here — confirm whether this call is still needed.
store_df = cleaner.fix_datatypes(clean_warehouse)

In [None]:
# Columns that must end up as float64.
numeric_columns = [
    'percentage', 'net_cost', 'flat_fee', 'gross_costbudget', 'volume_agreed',
    'buy_rate_cpe', 'platform_os', 'width', 'height',
]

# Encode the "Android" platform as 0 so the column can be cast to a number.
# NOTE(review): the original cell repeated this exact statement twice; the second copy was
# a no-op. If it was meant to encode another platform value, that mapping is still missing,
# and the float64 cast below will raise on any remaining non-numeric string.
clean_warehouse.loc[clean_warehouse['platform_os'] == "Android", 'platform_os'] = 0

# Creative dimensions recorded as "%" are replaced by 300 before numeric conversion;
# anything else that is not parseable becomes NaN (errors='coerce').
# NOTE(review): 300 is an unexplained placeholder — confirm the intended default.
for dimension in ('width', 'height'):
    clean_warehouse.loc[clean_warehouse[dimension] == "%", dimension] = 300
    clean_warehouse[dimension] = pd.to_numeric(clean_warehouse[dimension], errors='coerce')

clean_warehouse[numeric_columns] = clean_warehouse[numeric_columns].astype("float64")

In [None]:
# Cast the listed columns to pandas' generic object dtype.
object_columns = [
    "device_type",
    "type",
    "campaign_id",
    "creative_id",
    "geo_country",
    "site_name",
    "agency_fee",
    "serving_locations",
    "cost_centre",
    "currency",
    "browser",
]
clean_warehouse[object_columns] = clean_warehouse[object_columns].astype("object")

In [None]:
# Parse the three date columns into pandas datetime values.
for date_column in ("submission_date", "enddate", "startdate"):
    clean_warehouse[date_column] = pd.to_datetime(clean_warehouse[date_column])

In [None]:
# Report missing data via the project cleaner.
# NOTE(review): judging by its name this reports the share of missing values; the exact output is defined in the cleaner script.
cleaner.percent_missing(clean_warehouse)

In [None]:
# Re-inspect dtypes and non-null counts after the type conversions.
clean_warehouse.info()

In [None]:
# Remove the flat_fee column.
# Rebinding instead of inplace=True avoids mutating a possibly shared frame, and
# errors="ignore" keeps the cell re-runnable: the in-place version raised KeyError on a
# second execution because the column was already gone.
clean_warehouse = clean_warehouse.drop(columns=["flat_fee"], errors="ignore")

In [None]:
# Count the missing values remaining in each column.
clean_warehouse.isna().sum()

In [None]:
# Rows with no recorded percentage are assigned 100.
percentage_missing = clean_warehouse["percentage"].isna()
clean_warehouse.loc[percentage_missing, "percentage"] = 100

In [None]:
# Missing values in these numeric columns are replaced with 0.
for zero_filled_column in ("buy_rate_cpe", "volume_agreed", "gross_costbudget"):
    column_missing = clean_warehouse[zero_filled_column].isna()
    clean_warehouse.loc[column_missing, zero_filled_column] = 0

In [None]:
# Drop every row that still has a missing value in any column (mutates clean_warehouse in place).
clean_warehouse.dropna(inplace=True)
# NOTE(review): because the default dropna() above has already removed all rows containing
# a missing value, agency_fee can no longer hold any; this selection is therefore expected
# to display an empty frame. It reads as a sanity check — confirm that is the intent.
clean_warehouse.loc[clean_warehouse["agency_fee"].isna()]

In [None]:
# Remove duplicates through the project cleaner and rebind the result.
# NOTE(review): which columns define a duplicate is decided inside cleaner.drop_duplicates — not visible here.
clean_warehouse = cleaner.drop_duplicates(clean_warehouse)

In [None]:
def _parse_labels(raw):
    """Turn one raw ``labels`` cell into a dict with 'engagement' and 'click_through' lists.

    Args:
        raw: A single-quoted dict string, an already-parsed dict, or None/NaN.

    Returns:
        The parsed dict. A missing value yields a fresh dict with two empty lists.
        A dict is returned unchanged, so re-running the cell does not fail on
        values that were parsed by a previous run.

    Raises:
        json.JSONDecodeError: if the string is not valid JSON after replacing
            single quotes with double quotes.
    """
    if isinstance(raw, dict):
        return raw
    # `is None` replaces the original `== None`; NaN is treated as missing as well,
    # where it previously crashed on float.replace.
    if raw is None or pd.isna(raw):
        return {"engagement": [], "click_through": []}
    # NOTE(review): a blanket quote swap breaks on values that contain an apostrophe.
    return json.loads(raw.replace("'", "\""))


clean_warehouse['labels'] = clean_warehouse['labels'].apply(_parse_labels)
# One plain string column per label group; avoids constructing a pd.Series for every row
# and also works when the frame is empty.
clean_warehouse['labels_engagement'] = clean_warehouse['labels'].apply(
    lambda parsed: ",".join(parsed['engagement']))
clean_warehouse['labels_click_through'] = clean_warehouse['labels'].apply(
    lambda parsed: ",".join(parsed['click_through']))
clean_warehouse.head(20)

In [None]:
# Matches only the single quotes that sit next to a brace, bracket, colon or comma, so they
# can be swapped for double quotes. Written as a raw string: the original non-raw literal
# contained the invalid escapes \[ and \], which emit DeprecationWarning/SyntaxWarning.
# The compiled pattern is identical to the one the original string produced.
_TEXT_QUOTE_PATTERN = re.compile(
    r"(?<={)'|'(?=})|(?<=\[)'|'(?=\])|'(?=:)|(?<=: )'|'(?=,)|(?<=, )'"
)


def _parse_text(raw):
    """Turn one raw ``text`` cell into a dict with 'engagement' and 'click_through' lists.

    Args:
        raw: A single-quoted dict string, an already-parsed dict, or None/NaN.

    Returns:
        The parsed dict. A missing value yields a fresh dict with two empty lists.
        A dict is returned unchanged, so re-running the cell does not fail on
        values that were parsed by a previous run.

    Raises:
        json.JSONDecodeError: if the string is not valid JSON after the quote rewrite.
    """
    if isinstance(raw, dict):
        return raw
    # `is None` replaces the original `== None`; NaN is treated as missing as well,
    # where it previously crashed on float.replace.
    if raw is None or pd.isna(raw):
        return {"engagement": [], "click_through": []}
    # Collapse the '" sequence to a plain ' first (kept from the original cell), then
    # rewrite the delimiter quotes and parse.
    normalised = raw.replace("'\"", "'")
    return json.loads(_TEXT_QUOTE_PATTERN.sub("\"", normalised))


clean_warehouse['text'] = clean_warehouse['text'].apply(_parse_text)
# One plain string column per text group; avoids constructing a pd.Series for every row
# and also works when the frame is empty.
clean_warehouse['text_engagement'] = clean_warehouse['text'].apply(
    lambda parsed: ",".join(parsed['engagement']))
clean_warehouse['text_click_through'] = clean_warehouse['text'].apply(
    lambda parsed: ",".join(parsed['click_through']))
clean_warehouse['text'].head(20)