<a href="https://colab.research.google.com/github/yyyyyokoko/AI-class/blob/master/data_characterization/MAG/Luwei's_MAG_FieldOfStudyChildren_Report.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#@title
import pandas as pd
import re
import numpy as np
import plotly.io as pio
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

from datetime import date
from google.cloud import bigquery
from IPython.display import display, HTML
from google.colab import auth

# Authenticate the Colab user before any BigQuery access.
auth.authenticate_user()
# BigQuery client bound to the gcp-cset-projects project. The queries in the
# cells shown here are issued through pd.read_gbq rather than this object.
client = bigquery.Client(project="gcp-cset-projects")

In [2]:
# Load the complete FieldOfStudyChildren table into a DataFrame.
FieldOfStudyChildren = pd.read_gbq(
    "select * from gcp_cset_mag.FieldOfStudyChildren",
    dialect="standard",
    project_id="gcp-cset-projects",
)

In [None]:
# Preview the loaded table (pandas truncates the display of large frames).
FieldOfStudyChildren

Unnamed: 0,FieldOfStudyId,ChildFieldOfStudyId,import_time
0,87687168,69160334,2020-06-13 13:30:02
1,87687168,173391809,2020-06-13 13:30:02
2,87687168,182292516,2020-06-13 13:30:02
3,87687168,182358804,2020-06-13 13:30:02
4,87687168,193357580,2020-06-13 13:30:02
...,...,...,...
956843,87621631,2777253480,2020-06-13 13:30:02
956844,87621631,2908781348,2020-06-13 13:30:02
956845,87621631,2910473673,2020-06-13 13:30:02
956846,87621631,2989150663,2020-06-13 13:30:02


In [3]:
# Skeleton of the per-column characterization report: one row per data column
# of FieldOfStudyChildren, to be filled in by the checks that follow.
report_columns = ['Colname', 'Has Null?', 'Possible Null Values', '# of Null Values', 'Has Unique Values?',
                  'Original Data Type', 'Suggested Data Type', 'Note', 'Normalization Ideas']
finalDF = pd.DataFrame(columns=report_columns)
# The last column of the source table is deliberately left out of the report.
finalDF['Colname'] = FieldOfStudyChildren.columns[:-1]
# Starting values for the fields that the checks only overwrite conditionally.
for header, default in (('Possible Null Values', ''),
                        ('# of Null Values', 0),
                        ('Note', ''),
                        ('Normalization Ideas', '')):
  finalDF[header] = default

In [4]:
def checkUniqueness(df, report=None):
  """Print per-column uniqueness statistics and record them in the report.

  Every column of ``df`` except the last one is inspected. A column whose
  number of distinct values equals the row count is reported as unique and
  marked 'Yes' in the report's 'Has Unique Values?' field; otherwise it is
  marked 'No' and the number of values that occur exactly once is printed.

  Args:
    df: DataFrame to characterize; its last column is skipped.
    report: Report DataFrame with a 'Colname' column, updated in place.
      When omitted, the notebook-level ``finalDF`` is used, so existing
      single-argument calls keep working.

  Returns:
    None. Results are printed and written into ``report``.
  """
  if report is None:
    # Fall back to the notebook-level report frame.
    report = finalDF
  nrow = len(df)
  print("Total number of rows: ", nrow)
  for col in df.columns[:-1]:
    row_mask = report['Colname'] == col
    # dropna=False counts NaN as one distinct value, matching Series.unique().
    n_distinct = df[col].nunique(dropna=False)
    if n_distinct == nrow:
      print("Unique:", col)
      report.loc[row_mask, 'Has Unique Values?'] = 'Yes'
    else:
      # The printed figure is the number of values seen exactly once,
      # not the number of distinct values.
      n_singletons = int((df[col].value_counts() == 1).sum())
      print('Number of unique values of', col, 'is:', n_singletons)
      report.loc[row_mask, 'Has Unique Values?'] = 'No'
  return
# Run the uniqueness check on the loaded table; it prints its findings and
# fills in the 'Has Unique Values?' column of finalDF.
checkUniqueness(FieldOfStudyChildren)

Total number of rows:  956848
Number of unique values of FieldOfStudyId is: 20743
Number of unique values of ChildFieldOfStudyId is: 302653


In [5]:
#check NA
def checkNA(df, finalDF):
  """Record null statistics for every column of ``df`` except the last.

  Only real missing values (as reported by ``isnull``) are counted; string
  placeholders such as 'NA' or 'null' are not examined.

  Args:
    df: DataFrame to inspect; its last column is skipped.
    finalDF: Report DataFrame keyed by 'Colname', updated in place.

  Returns:
    The same ``finalDF`` object, with 'Has Null?', 'Possible Null Values'
    and '# of Null Values' filled in.
  """
  for column in df.columns[:-1]:
    row_mask = finalDF.Colname == column
    null_count = df[column].isnull().sum()
    if null_count != 0:
      finalDF.loc[row_mask, 'Possible Null Values'] = 'np.nan'
      finalDF.loc[row_mask, 'Has Null?'] = 'Yes'
      # The count is added to whatever the report already holds.
      finalDF.loc[row_mask, '# of Null Values'] += null_count
    else:
      finalDF.loc[row_mask, 'Has Null?'] = 'No'
  # Report rows that matched no inspected column default to 'No'.
  finalDF['Has Null?'] = finalDF['Has Null?'].fillna('No')
  return finalDF
# Record null statistics for each data column of the table in the report.
finalDF = checkNA(FieldOfStudyChildren, finalDF)

In [6]:
# Hand-entered type information, one entry per report row (two rows here).
column_types = {
    'Suggested Data Type': ['STRING', 'STRING'],
    'Original Data Type': ['INTEGER', 'INTEGER'],
}
for header, values in column_types.items():
  finalDF[header] = values

## **Report Overview**

In [7]:
# Display the completed characterization report.
finalDF

Unnamed: 0,Colname,Has Null?,Possible Null Values,# of Null Values,Has Unique Values?,Original Data Type,Suggested Data Type,Note,Normalization Ideas
0,FieldOfStudyId,No,,0,No,INTEGER,STRING,,
1,ChildFieldOfStudyId,No,,0,No,INTEGER,STRING,,


1. A child field of study can have more than one parent.
2. Most **FieldOfStudyId** values appear 1 to 10 times in the dataset, but there are some outliers, such as botany and ecology, that appear more than 20,000 times.
3. Most **ChildFieldOfStudyId** values have 1 to 10 parents, but some children have more than 100 parents. These values are graphed below.

## **Graphs**

In [None]:
# Distribution of FieldOfStudyId after binning.
# The query joins FieldsOfStudy to FieldOfStudyChildren on the parent id
# (b.FieldOfStudyId), counts the joined rows per NormalizedName, rounds each
# count to a multiple of 10 (c_bin), and then counts how many names fall into
# each bin. Result columns: c_bin, num_names, ordered by c_bin.
binned_data_string = "select c_bin, count(c_bin) as num_names from (\
  select cast(round(count(NormalizedName)/10.0)*10 as int64) as c_bin from \
  (select a.NormalizedName, b.FieldOfStudyId from gcp_cset_mag.FieldsOfStudy a join gcp_cset_mag.FieldOfStudyChildren b on a.FieldOfStudyId = b.FieldOfStudyId) \
   group by NormalizedName order by c_bin desc\
) group by c_bin order by c_bin"
binned_data = pd.read_gbq(binned_data_string, dialect="standard", project_id="gcp-cset-projects")


In [None]:
# Show the binned counts returned by the query.
binned_data

Unnamed: 0,c_bin,num_names
0,0,37018
1,10,10831
2,20,2935
3,30,1366
4,40,785
...,...,...
168,10320,1
169,10970,1
170,13800,1
171,20310,1


In [None]:
# Bar chart of the binned FieldOfStudyId counts: one bar per c_bin value,
# bar height = number of names in that bin. The x-axis is categorical, so
# bins are drawn as evenly spaced labels rather than on a numeric scale.
# NOTE(review): hovertext receives the whole DataFrame; plotly documents
# hovertext as a string or an array of strings -- confirm this is intended.
data = [go.Bar(x = binned_data['c_bin'], y = binned_data['num_names'], hovertext = binned_data)]
fig_name = "Bar chart of counts(bin) of FieldOfStudyId"
# The x-axis title names FieldOfStudyId so that it agrees with the figure
# title and with the data this cell plots.
layout = go.Layout(xaxis=dict(type='category',title = "FieldOfStudyId bin"),title= fig_name, yaxis=dict(title = 'Count'))
fig = go.Figure(data=data, layout=layout)
fig.show()

In [None]:
# Outliers of FieldOfStudyId: the 20 NormalizedName values that occur most
# often when FieldsOfStudy is joined to FieldOfStudyChildren on the parent id.
temp = (
    pd.read_gbq(
        "select a.NormalizedName, b.FieldOfStudyId from gcp_cset_mag.FieldsOfStudy a join gcp_cset_mag.FieldOfStudyChildren b on a.FieldOfStudyId = b.FieldOfStudyId",
        dialect="standard",
        project_id="gcp-cset-projects",
    )
    .NormalizedName.value_counts()
    .head(20)
    .reset_index()
)
temp.columns = ['NormalizedName', 'Counts']
temp['NormalizedName'] = temp['NormalizedName'].astype(str)
fig = px.bar(
    temp,
    x='NormalizedName',
    y='Counts',
    text='Counts',
    title='Bar Chart of Top 20 Most Frequent FieldOfStudyId of FieldOfStudyChildren',
)
# Treat the names as categories so every bar gets its own label.
fig.update_layout(xaxis={'type': 'category'})
fig.show()

In [None]:
# Distribution of ChildFieldOfStudyId after binning.
# The query joins FieldsOfStudy to FieldOfStudyChildren on the child id
# (b.ChildFieldOfStudyId), counts the joined rows per NormalizedName, rounds
# each count to a multiple of 10 (c_bin), and then counts how many names fall
# into each bin. Result columns: c_bin, num_names, ordered by c_bin.
# Note that this overwrites the binned_data produced for FieldOfStudyId.
binned_data_string = "select c_bin, count(c_bin) as num_names from (\
  select cast(round(count(NormalizedName)/10.0)*10 as int64) as c_bin from \
  (select a.NormalizedName, b.ChildFieldOfStudyId from gcp_cset_mag.FieldsOfStudy a join gcp_cset_mag.FieldOfStudyChildren b on a.FieldOfStudyId = b.ChildFieldOfStudyId) \
   group by NormalizedName order by c_bin desc\
) group by c_bin order by c_bin"
binned_data = pd.read_gbq(binned_data_string, dialect="standard", project_id="gcp-cset-projects")


In [None]:
# Show the binned counts returned by the query.
binned_data

Unnamed: 0,c_bin,num_names
0,0,535809
1,10,9803
2,20,13
3,30,2
4,50,1
5,60,1
6,70,2
7,80,4
8,90,6
9,100,3


In [None]:
# Bar chart of the binned ChildFieldOfStudyId counts: one bar per c_bin value,
# bar height = number of names in that bin, bins shown as categories.
fig_name = "Bar chart of counts(bin) of ChildFieldOfStudyId"
data = [
    go.Bar(
        x=binned_data['c_bin'],
        y=binned_data['num_names'],
        hovertext=binned_data,
    )
]
layout = go.Layout(
    title=fig_name,
    xaxis={'type': 'category', 'title': "ChildFieldOfStudyId bin"},
    yaxis={'title': 'Count'},
)
fig = go.Figure(data=data, layout=layout)
fig.show()

In [None]:

# Outliers of ChildFieldOfStudyId: the 20 NormalizedName values that occur
# most often when FieldsOfStudy is joined to FieldOfStudyChildren on the
# child id.
temp = (
    pd.read_gbq(
        "select a.NormalizedName, b.ChildFieldOfStudyId from gcp_cset_mag.FieldsOfStudy a join gcp_cset_mag.FieldOfStudyChildren b on a.FieldOfStudyId = b.ChildFieldOfStudyId",
        dialect="standard",
        project_id="gcp-cset-projects",
    )
    .NormalizedName.value_counts()
    .head(20)
    .reset_index()
)
temp.columns = ['NormalizedName', 'Counts']
temp['NormalizedName'] = temp['NormalizedName'].astype(str)
fig = px.bar(
    temp,
    x='NormalizedName',
    y='Counts',
    text='Counts',
    title='Bar Chart of Top 20 Most Frequent ChildFieldOfStudyId of FieldOfStudyChildren',
)
# Treat the names as categories so every bar gets its own label.
fig.update_layout(xaxis={'type': 'category'})
fig.show()