# Prepare ASIF dataset for OP productivity

This notebook has been generated on 2019-10-05 07:45 

The objective of this notebook is to prepare the ASIF firm-level dataset used for the OP productivity estimation.

## Global steps 

The global steps to construct the dataset are the following:


- From BigQuery
    - Select year 1998-2007 ASIF


## Data source 

The data sources used to construct the dataset are the following:

### Big Query Dataset 
 
 - asif_firm_china 

In [None]:
from Fast_connectCloud import connector
from GoogleDrivePy.google_drive import connect_drive
from GoogleDrivePy.google_platform import connect_cloud_platform
import pandas as pd 
import numpy as np
import pandas_profiling
import plotly.express as px

In [None]:
# Folder holding the Google credential files read by the connector.
# NOTE(review): absolute, user-specific path -- the notebook only runs on this
# machine as written; consider moving it to a config value or env variable.
path_credential = '/Users/Thomas/Google Drive/Projects/Data_science/Google_code_n_Oauth/Client_Oauth/Google_auth/'

# Open the connector from the local credential folder.
gs = connector.open_connection(
    online_connection=False,
    path_credential=path_credential,
)

# One remote handle per engine; each result is indexed by service name below.
service_gd = gs.connect_remote(engine='GS')
service_gcp = gs.connect_remote(engine='GCP')

# Google Drive client.
gdr = connect_drive.connect_drive(service_gd['GoogleDrive'])

# Google Cloud Platform client (used later for BigQuery and GCS calls).
project = 'valid-pagoda-132423'
gcp = connect_cloud_platform.connect_console(
    project=project,
    service_account=service_gcp['GoogleCloudP'],
)

## Load asif_firm_china from Google Big Query

Feel free to add a description of the dataset or any other useful information.

In [None]:
# The query is assembled from its three clauses; the concatenated text is what
# gets sent to BigQuery.
select_clause = "SELECT newID, year,bdat, geocode4_corr, cityen_correct,cic, ownership, output, employment, fa_net "
from_clause = "FROM China.asif_firm_china "
# NOTE(review): this filter keeps 1998 through 2008 inclusive, whereas the
# notebook introduction announces 1998-2007 -- confirm which range is intended.
where_clause = "WHERE year > 1997 AND year < 2009"
query = select_clause + from_clause + where_clause

# Run the query through the GCP helper and preview the first rows.
df_asif_firm_china = gcp.upload_data_from_bigquery(query=query, location="US")
df_asif_firm_china.head()

## Step 1: Clean up switching firms

1. Keep corresponding year defined in the parameter part 
2. Create SOE vs Private

- Clean up:
    - multi ownership
    - city
    - industry

In [None]:
# Binary ownership flag: "SOE" when ownership is exactly "SOE", "Private" otherwise.
is_soe = df_asif_firm_china["ownership"] == "SOE"
df_no_switch = df_asif_firm_china.assign(
    SOE=np.where(is_soe, "SOE", "Private")
)

# Drop every firm (newID) that takes more than one distinct value of the
# attribute over the panel. The filters are applied sequentially, so the city
# count is computed on the firms that survived the ownership filter.
# NOTE(review): the step description also lists industry, but no industry
# column is checked here -- confirm whether that is intentional.
for column in ['SOE', 'cityen_correct']:

    distinct_per_firm = df_no_switch.groupby('newID')[column].nunique()
    index_ = distinct_per_firm[distinct_per_firm > 1].index

    is_stable = ~df_no_switch['newID'].isin(index_)
    df_no_switch = df_no_switch[is_stable]

    print('There is {} firms switching {}'.format(len(index_), column))


## Step 2: Remove single year

We exclude the firms appearing only for one year

In [None]:
# Number of observations per firm identifier.
obs_per_firm = df_no_switch.groupby(['newID'])['newID'].count()

# Firms observed exactly once in the panel.
single_year = obs_per_firm[obs_per_firm == 1].index
single_year.shape  # evaluated but not displayed: only the last expression of a cell is rendered

# Keep only the firms observed in more than one year.
appears_once = df_no_switch['newID'].isin(single_year)
df_multi_year = df_no_switch[~appears_once]
df_multi_year.shape

## Step 3: Compute additional variables

We compute age and investment. Age is $\text{year} - \text{bdat}$, and investment is the year-over-year change in `fa_net`: $\text{fa_net}_t - \text{fa_net}_{t-1}$.

In [None]:
def _trim_tails(frame, column, lower=0.05, upper=0.98):
    """Keep the rows whose `column` lies strictly between two of its quantiles.

    Parameters
    ----------
    frame : pandas.DataFrame
        Input frame; it is not modified.
    column : str
        Name of the numeric column to trim on.
    lower, upper : float
        Quantile levels of the lower and upper cut-offs. Both bounds are
        exclusive, and the quantiles are computed on `frame` as received.

    Returns
    -------
    pandas.DataFrame
        The rows of `frame` with `lower quantile < column < upper quantile`.
        Rows where `column` is missing fail both comparisons and are dropped.
    """
    low, high = frame[column].quantile([lower, upper])
    return frame.loc[(frame[column] > low) & (frame[column] < high)]


df_final = (
    df_multi_year
    # Trim the tails one variable at a time: each cut-off is computed on the
    # rows that survived the previous trim, so the order matters.
    .pipe(_trim_tails, 'fa_net')
    .pipe(_trim_tails, 'employment')
    .pipe(_trim_tails, 'output')
    # Discard implausible or out-of-sample birth years.
    .loc[lambda x: (x['bdat'] > 1800) & (x['bdat'] < 2008)]
    .assign(age=lambda x: x['year'] - x['bdat'])
    .loc[lambda x: x['age'] > 0]
    # The within-firm lag is only meaningful on chronologically ordered rows.
    .sort_values(['newID', 'year'])
    # Investment = fa_net_t - fa_net_{t-1} within each firm. The previous
    # version read the lag from `df_drop`, a frame never defined in this
    # notebook, and stored the lagged level instead of the difference.
    # NOTE(review): the column name keeps its historical spelling 'invesment'
    # because the exported table may be consumed under that name -- confirm
    # before renaming. If a firm has missing years, the difference spans more
    # than one year.
    .assign(
        invesment=lambda x: x['fa_net'] - x.groupby('newID')['fa_net'].shift(1)
    )
    # Drops every row with a missing value in any column; this includes each
    # firm's first remaining observation, which has no lag to difference against.
    .dropna()
)
df_final.head()

In [None]:
# Summary statistics of the numeric columns, as a sanity check before export.
df_final.describe()

# Upload to cloud

The dataset is ready to be shared with your colleagues. 

### Move to GCS and BigQuery

We move the dataset to the following:

- **bucket**: *chinese_data*

- **Destination_blob**: *Panel_china/Asif_panel_china/Processed_*
- **name**:  *asif_for_tfp.gz*
- **Dataset**: *China*

- **table**: *asif_for_tfp*

### GCS

We first need to save *asif_for_tfp* locally with a `.gz` extension; then we can move it
to GCS.

In [None]:
# Remove the previous export so the upload below starts from a clean state:
# first the blob in the bucket, then the table `asif_for_tfp` in dataset `China`.
bucket_name = 'chinese_data'
destination_blob_name = 'Panel_china/Asif_panel_china/Processed_/asif_for_tfp.gz'

# NOTE(review): behaviour when the blob or table does not exist yet depends on
# the gcp helper -- verify that a first run does not fail here.
gcp.delete_blob(bucket_name = bucket_name,
                destination_blob_name= destination_blob_name)
gcp.delete_table(dataset_name = 'China', name_table = 'asif_for_tfp')

In [None]:

### Write the final frame to a local gzip-compressed CSV
source_file_name = 'asif_for_tfp.gz'
csv_options = dict(
    sep=',',
    header=True,
    index=False,
    chunksize=100000,
    compression='gzip',
    encoding='utf-8',
)
df_final.to_csv(source_file_name, **csv_options)

### Push the local file to the GCS bucket
# NOTE(review): the destination here is the folder prefix only, whereas the
# delete cell targets '<prefix>/asif_for_tfp.gz' -- presumably upload_blob
# appends the file name; verify against the helper.
bucket_name = 'chinese_data'
destination_blob_name = 'Panel_china/Asif_panel_china/Processed_'
gcp.upload_blob(bucket_name, destination_blob_name, source_file_name)

In [None]:
import os

# Delete the temporary local copy now that it has been uploaded.
os.unlink(source_file_name)