In [2]:
import numpy as np
import pandas as pd
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql import functions as f

# Obtain a Hive-enabled Spark session via the builder's `getOrCreate()`
spark = (
    SparkSession.builder
    .enableHiveSupport()
    .getOrCreate()
)

# Suppresses user warning messages (`UserWarning`) in Python
import warnings
warnings.simplefilter(action="ignore", category=UserWarning)

# Suppresses `WARN` messages in JVM by raising the log level to `ERROR`
spark.sparkContext.setLogLevel("ERROR")

In [7]:
import importlib

# The module name contains a hyphen, so it cannot be written in a plain
# `import` statement; it is loaded by name instead.
ptesting_model = importlib.import_module(name="ptesting-model")

In [6]:
# SQL expressions selected from the raw GitHub log records
expected_input_cols = [
    'author',
    # Replace an empty `sha` with a hash of a random number (non-deterministic)
    'case when length(sha) > 0 then sha else sha(string(random())) end sha',
    'commit_date',
    # Drop duplicated entries from the failed-test list
    'array_distinct(failed_tests) failed_tests',
    'files',
]

log_data_df = (
    spark.read
    .format('json')
    .load('../models/spark/logs/github-logs.json')
    .selectExpr(*expected_input_cols)
)

In [8]:
# Preview the first rows of the log data (explicitly spelling out the defaults)
log_data_df.show(n=20, truncate=True)

+------------------+--------------------+-------------------+--------------------+--------------------+
|            author|                 sha|        commit_date|        failed_tests|               files|
+------------------+--------------------+-------------------+--------------------+--------------------+
|       ChenMichael|f186c231aa504fe9e...|2021/09/17 16:32:01|                  []|[{{2, 3, 1, sql/c...|
|          f-thiele|4cb0c3c6fc3e8a451...|2021/09/16 15:24:44|[pyspark.mllib.te...|[{{11, 17, 6, cor...|
|            viirya|8db8b50e0621b46e6...|2021/09/22 18:49:34|[org.apache.spark...|[{{19, 23, 4, sql...|
|            viirya|82ccaf18d64f46ffe...|2021/09/20 21:46:52|[org.apache.spark...|[{{1, 2, 1, sql/c...|
|            viirya|ba4172076f3f80305...|2021/07/04 06:37:05|[pyspark.pandas.t...|[{{1, 1, 0, sql/c...|
|            viirya|f9ae525cca0197546...|2021/04/15 01:29:22|                  []|[{{3, 6, 3, sql/c...|
|         karenfeng|916cef25586e084d3...|2021/06/09 21:22:58|   

In [11]:
import json
from pathlib import Path

# Read the list of tests to exclude, then filter them out of the log data
excluded_tests_file = Path('../models/spark/logs/excluded-tests.json')
excluded_tests = json.loads(excluded_tests_file.read_text())
log_data_df = ptesting_model._exclude_tests_from(log_data_df, excluded_tests)

In [12]:
from ptesting import github_utils


def _load_json(path):
    """Read the file at `path` as text and return the decoded JSON object."""
    return json.loads(Path(path).read_text())


test_files = _load_json('../models/spark/indexes/latest/test-files.json')
# Only the first field of each commit record is kept, converted by
# `github_utils.from_github_datetime`.
commits = [
    github_utils.from_github_datetime(commit[0])
    for commit in _load_json('../models/spark/logs/commits.json')
]
updated_file_stats = _load_json('../models/spark/logs/updated-file-stats.json')
failed_tests = _load_json('../models/spark/failed-tests.json')
contributor_stats = _load_json('../models/spark/logs/contributor-stats.json')
dep_graph = _load_json('../models/spark/indexes/latest/dep-graph.json')

# The pipeline factory returns a pair; only the first element (the
# training-feature transformer) is used in this notebook.
to_train_features, _ = ptesting_model._create_train_test_pipeline(
    spark,
    test_files,
    commits,
    dep_graph,
    updated_file_stats,
    contributor_stats,
    failed_tests,
)

# Apply the transformer to the log data and pull the result into pandas
pdf = to_train_features(log_data_df).cache().toPandas()

In [13]:
from ptesting import train

# Split the feature frame into predictors and the `failed` label column
label_col = 'failed'
predictors = pdf.loc[:, pdf.columns != label_col]
labels = pdf[label_col]

# Rebalance the training data, then re-attach the label so that the
# plotting cells can use a single frame.
X, y = train.rebalance_training_data(predictors, labels, coeff=1.0)
X[label_col] = y

2021-10-19 16:26:58.410 INFO train: Sampling training data (strategy={0: 861, 1: 861}): {0: 117277, 1: 861} => {0: 861, 1: 861}


In [14]:
# Summary statistics of the rebalanced data (default quartiles spelled out)
X.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,num_commits,updated_num_3d,updated_num_14d,updated_num_56d,updated_num_3c,updated_num_14c,updated_num_56c,num_adds,num_dels,num_chgs,...,failed_num_7d,failed_num_14d,failed_num_28d,failed_num_7c,failed_num_14c,failed_num_28c,total_failed_num,path_difference,distance,failed
count,1722.0,1722.0,1722.0,1722.0,1722.0,1722.0,1722.0,1722.0,1722.0,1722.0,...,1722.0,1722.0,1722.0,1722.0,1722.0,1722.0,1722.0,1722.0,1722.0,1722.0
mean,100.853659,3.343206,8.555168,16.335075,0.321719,0.506969,4.003484,1370.207317,247.558653,1617.76597,...,0.528455,0.566783,0.630662,0.004646,0.009292,0.023229,1.265389,3.788618,55.521487,0.5
std,144.541804,3.908911,7.732349,13.011318,1.971288,2.06214,3.67854,10925.039849,1039.907752,11155.761731,...,0.556565,0.60061,0.69157,0.068021,0.1074,0.168858,1.498734,4.925774,62.492522,0.500145
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,1.0,7.0,0.0,0.0,1.0,48.0,7.0,55.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,0.0
50%,83.0,1.0,6.0,19.0,0.0,0.0,3.0,80.0,9.0,91.0,...,1.0,1.0,1.0,0.0,0.0,0.0,1.0,4.0,2.0,0.5
75%,83.0,8.0,18.0,20.0,0.0,0.0,8.0,179.0,52.0,236.0,...,1.0,1.0,1.0,0.0,0.0,0.0,1.0,6.0,128.0,1.0
max,887.0,29.0,29.0,100.0,29.0,29.0,29.0,130829.0,9386.0,132030.0,...,3.0,4.0,7.0,1.0,2.0,3.0,12.0,128.0,128.0,1.0


In [15]:
import altair as alt


def _freq_chart(column):
    """Build a 300x300 bar chart of row counts per value of `column` in `X`."""
    freq_axis = alt.Y('count()', axis=alt.Axis(title='freq'))
    return (
        alt.Chart(X)
        .mark_bar()
        .encode(x=alt.X(column), y=freq_axis)
        .properties(width=300, height=300)
    )


# One frequency chart per column of X, concatenated horizontally
charts = [_freq_chart(column) for column in X.columns]

alt.hconcat(*charts)

In [16]:
import altair as alt

target = 'distance'
target_min, target_max = 0, 10

# Keep only the rows whose `target` value lies between the two bounds,
# and pin the x scale to the same range.
in_range = X[target].between(target_min, target_max)
target_axis = alt.X(target, scale=alt.Scale(domain=[target_min, target_max]))
freq_axis = alt.Y('count()', axis=alt.Axis(title='freq'))

(
    alt.Chart(X[in_range])
    .mark_bar()
    .encode(x=target_axis, y=freq_axis)
    .properties(width=400, height=400)
)

In [17]:
import altair as alt

# Scatter plot of two features, colored by the `failed` label.
x_name = 'num_chgs'
x_min, x_max = 0, 4000

y_name = 'num_commits'
y_min, y_max = 0, 1000

x_axis = alt.X(x_name, scale=alt.Scale(domain=[x_min, x_max]))
# The vertical channel is declared with `alt.Y` (it was previously built
# with `alt.X`, i.e. the wrong channel class for the `y=` encoding).
y_axis = alt.Y(y_name, scale=alt.Scale(domain=[y_min, y_max]))
color = alt.Color('failed:N', scale=alt.Scale(range=['blue', 'red']))

# Only rows inside both axis ranges are plotted
alt.Chart(X[X[x_name].between(x_min, x_max) & X[y_name].between(y_min, y_max)]) \
    .mark_point().encode(x=x_axis, y=y_axis, color=color).properties(width=600, height=400).interactive()

In [18]:
import altair as alt
from sklearn.decomposition import PCA

# Fit a 2-component PCA on four selected features
pca = PCA(n_components=2)
_X = X[['total_failed_num', 'path_difference', 'updated_num_56d', 'num_commits']]
pca.fit(_X)

# Project the same rows onto the two components and plot them
projected = pd.DataFrame(pca.transform(_X), columns=['c0', 'c1'])
(
    alt.Chart(projected)
    .mark_point()
    .encode(x='c0', y='c1')
    .properties(width=600, height=400)
    .interactive()
)