In [1]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StringType
import docker

import requests

import logging
import os
import subprocess
import sys

import shutil

from collections import defaultdict

In [2]:
# Start Spark session
spark = (SparkSession
         .builder
         .appName("Handle new project pipline")
         .getOrCreate())

22/06/21 17:16:36 WARN Utils: Your hostname, UNIT-1700 resolves to a loopback address: 127.0.1.1; using 192.168.0.104 instead (on interface wlp59s0)
22/06/21 17:16:36 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/06/21 17:16:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
def create_dir(dir_path: str):
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)

In [4]:
def remove_dir(dir_path: str):
    shutil.rmtree(dir_path)

In [5]:
PATH_TO_CLONE_REPO = "/home/Dmitry.Pogrebnoy/Desktop/tmp_folder"
GIT_CLONE_LINK="https://github.com/zuevmaxim/itmo-ibd.git"
PROJECT_OWNER = GIT_CLONE_LINK.split("/")[-2]
PROJECT_NAME = GIT_CLONE_LINK.split("/")[-1].split(".git")[0]
PROJECT_PATH = os.path.join(PATH_TO_CLONE_REPO, PROJECT_NAME)

In [6]:
create_dir(PROJECT_PATH)

In [7]:
#First of all clone repo
p = subprocess.Popen(['git', 'clone', GIT_CLONE_LINK, PROJECT_PATH, '--depth', '1'])
return_code = p.wait()
if return_code != 0:
    logging.info(f'Error while cloning {GIT_CLONE_LINK}!')
    exit(1)

fatal: destination path '/home/Dmitry.Pogrebnoy/Desktop/tmp_folder/itmo-ibd' already exists and is not an empty directory.


In [8]:
#Compute extensions metrics
cont_extensions = defaultdict(int)
for root, _, files in os.walk(PROJECT_PATH):
    for filename in files:
        extension = os.path.splitext(filename)[1]
        cont_extensions[extension] += 1
extensions_metrics = []
for extension, count in cont_extensions.items():
    extensions_metrics.append((f"{PROJECT_NAME}" ,extension, count))

In [9]:
extensions_metrics_dataset = spark.createDataFrame(extensions_metrics).toDF(*["project_name", "extension", "count"]).cache()

In [10]:
extensions_metrics_dataset.show()

                                                                                

+------------+---------+-----+
|project_name|extension|count|
+------------+---------+-----+
|    itmo-ibd|         |   85|
|    itmo-ibd|      .md|    4|
|    itmo-ibd|      .py|   10|
|    itmo-ibd|   .ipynb|    2|
|    itmo-ibd|  .sample|   12|
|    itmo-ibd|      .sh|    1|
|    itmo-ibd|     .csv|   31|
+------------+---------+-----+



In [11]:
def rename_extension(package_name):
    return f"extension#{package_name}"

udf_rename_extension = F.udf(rename_extension, returnType=StringType())

In [12]:
extensions_metrics_dataset = extensions_metrics_dataset.select("project_name", udf_rename_extension("extension").alias("extension"), "count")
extensions_metrics_dataset.show()

+------------+-----------------+-----+
|project_name|        extension|count|
+------------+-----------------+-----+
|    itmo-ibd|       extension#|   85|
|    itmo-ibd|    extension#.md|    4|
|    itmo-ibd|    extension#.py|   10|
|    itmo-ibd| extension#.ipynb|    2|
|    itmo-ibd|extension#.sample|   12|
|    itmo-ibd|    extension#.sh|    1|
|    itmo-ibd|   extension#.csv|   31|
+------------+-----------------+-----+



In [13]:
# generate unique folders every time
PATH_TO_LUPA_KOTLIN_OUTPUT = "/home/Dmitry.Pogrebnoy/Desktop/tmp_lupa_kotlin_output"
PATH_TO_LUPA_PYTHON_OUTPUT = "/home/Dmitry.Pogrebnoy/Desktop/tmp_lupa_python_output"

In [14]:
create_dir(PATH_TO_LUPA_KOTLIN_OUTPUT)
create_dir(PATH_TO_LUPA_PYTHON_OUTPUT)

In [15]:
docker_volumes= {
    f'{PATH_TO_CLONE_REPO}' : {'bind' : '/data', 'mode' : 'ro'},
    f'{PATH_TO_LUPA_PYTHON_OUTPUT}' : {'bind' : '/output_python', 'mode' : 'rw'},
    f'{PATH_TO_LUPA_KOTLIN_OUTPUT}' : {'bind' : '/output_kotlin', 'mode' : 'rw'}
}

In [248]:
# run lupa docker to extract imports
docker_client = docker.from_env()
docker_client.containers.run('pogrebnoy/ibd-lupa-extract-imports:1.0.0',
                                         auto_remove=True,
                                         #user=f"{os.getuid()}", # Fails lupa with Exception in thread "main" java.lang.RuntimeException: Could not create parent directory for lock file /Lupa/?/.gradle/wrapper/dists/gradle-6.8.3-bin/7ykxq50lst7lb7wx1nijpicxn/gradle-6.8.3-bin.zip.lck
                                         stderr=True,
                                         volumes=docker_volumes)

KeyboardInterrupt: 

In [16]:
# Gathering all imports data to one dataset
python_imports_dataset = spark.read.csv(os.path.join(PATH_TO_LUPA_PYTHON_OUTPUT, "import_statements_data.csv"), header=True).cache()
python_imports_dataset.show()

+------------+--------------------+
|project_name|              import|
+------------+--------------------+
|    itmo-ibd|                 sys|
|    itmo-ibd|            requests|
|    itmo-ibd|             logging|
|    itmo-ibd|                  os|
|    itmo-ibd|          subprocess|
|    itmo-ibd|              pandas|
|    itmo-ibd|              shutil|
|    itmo-ibd|                json|
|    itmo-ibd|collections.defau...|
|project_name|              import|
|    itmo-ibd|                 sys|
|    itmo-ibd|            requests|
|    itmo-ibd|             logging|
|    itmo-ibd|                  os|
|    itmo-ibd|          subprocess|
|    itmo-ibd|              pandas|
|    itmo-ibd|              shutil|
|    itmo-ibd|                json|
|    itmo-ibd|collections.defau...|
+------------+--------------------+



In [17]:
kotlin_imports_dataset = spark.read.csv(os.path.join(PATH_TO_LUPA_KOTLIN_OUTPUT, "import_directives_data.csv"), header=True).cache()
kotlin_imports_dataset.show()

+------------+------+
|project_name|import|
+------------+------+
|project_name|import|
+------------+------+



In [18]:
imports_dataset = python_imports_dataset.union(kotlin_imports_dataset).cache()
imports_dataset.show()

+------------+--------------------+
|project_name|              import|
+------------+--------------------+
|    itmo-ibd|                 sys|
|    itmo-ibd|            requests|
|    itmo-ibd|             logging|
|    itmo-ibd|                  os|
|    itmo-ibd|          subprocess|
|    itmo-ibd|              pandas|
|    itmo-ibd|              shutil|
|    itmo-ibd|                json|
|    itmo-ibd|collections.defau...|
|project_name|              import|
|    itmo-ibd|                 sys|
|    itmo-ibd|            requests|
|    itmo-ibd|             logging|
|    itmo-ibd|                  os|
|    itmo-ibd|          subprocess|
|    itmo-ibd|              pandas|
|    itmo-ibd|              shutil|
|    itmo-ibd|                json|
|    itmo-ibd|collections.defau...|
|project_name|              import|
+------------+--------------------+



In [19]:
PATH_TO_IMPORT_TO_PACKAGE_DATASET="/home/Dmitry.Pogrebnoy/Desktop/itmo-ibd/data/full_import_dataset/lupa_import_grouping/output/import_by_package.csv"

In [20]:
import_to_package_dataset = spark.read.csv(PATH_TO_IMPORT_TO_PACKAGE_DATASET, header=True)
import_to_package_dataset.show()

+--------------------+------------------+
|              import|           package|
+--------------------+------------------+
|androidx.appcompa...|androidx.appcompat|
|androidx.appcompa...|androidx.appcompat|
|androidx.appcompa...|androidx.appcompat|
|androidx.appcompa...|androidx.appcompat|
|androidx.appcompa...|androidx.appcompat|
|androidx.appcompa...|androidx.appcompat|
|androidx.appcompa...|androidx.appcompat|
|androidx.appcompa...|androidx.appcompat|
|androidx.appcompa...|androidx.appcompat|
|androidx.appcompa...|androidx.appcompat|
|androidx.appcompa...|androidx.appcompat|
|androidx.appcompa...|androidx.appcompat|
|androidx.appcompa...|androidx.appcompat|
|androidx.appcompa...|androidx.appcompat|
|androidx.appcompa...|androidx.appcompat|
|androidx.appcompa...|androidx.appcompat|
|androidx.appcompa...|androidx.appcompat|
|androidx.appcompa...|androidx.appcompat|
|androidx.appcompa...|androidx.appcompat|
|androidx.appcompa...|androidx.appcompat|
+--------------------+------------

In [21]:
import_to_package_dataset = import_to_package_dataset.toPandas()
import_to_package_dict = dict(zip(import_to_package_dataset["import"], import_to_package_dataset["package"]))
import_to_package_dict

{'androidx.appcompat.app.AppCompatActivity': 'androidx.appcompat',
 'androidx.appcompat.app.ActionBarDrawerToggle': 'androidx.appcompat',
 'androidx.appcompat.app.AlertDialog': 'androidx.appcompat',
 'androidx.appcompat.app.AppCompatDelegate': 'androidx.appcompat',
 'androidx.appcompat.widget.Toolbar': 'androidx.appcompat',
 'androidx.appcompat.content.res.AppCompatResources': 'androidx.appcompat',
 'androidx.appcompat.view.ContextThemeWrapper': 'androidx.appcompat',
 'androidx.appcompat.widget.PopupMenu': 'androidx.appcompat',
 'androidx.appcompat.graphics.drawable.DrawerArrowDrawable': 'androidx.appcompat',
 'androidx.appcompat.app.AppCompatDialogFragment': 'androidx.appcompat',
 'androidx.appcompat.widget.AppCompatImageButton': 'androidx.appcompat',
 'androidx.appcompat.widget.AppCompatTextView': 'androidx.appcompat',
 'androidx.appcompat.widget.AppCompatSeekBar': 'androidx.appcompat',
 'androidx.appcompat.widget.SearchView': 'androidx.appcompat',
 'androidx.appcompat.widget.SwitchC

In [22]:
def get_package_by_import(lib_import):
    if lib_import in import_to_package_dict:
        return import_to_package_dict[lib_import]
    else:
        return lib_import

map_import_to_package = F.udf(get_package_by_import, returnType=StringType())

In [23]:
full_import_dataset = imports_dataset.select(
 "*", map_import_to_package("import").alias("package")
).cache()
full_import_dataset.show()

+------------+--------------------+-----------+
|project_name|              import|    package|
+------------+--------------------+-----------+
|    itmo-ibd|                 sys|        sys|
|    itmo-ibd|            requests|   requests|
|    itmo-ibd|             logging|    logging|
|    itmo-ibd|                  os|         os|
|    itmo-ibd|          subprocess| subprocess|
|    itmo-ibd|              pandas|     pandas|
|    itmo-ibd|              shutil|     shutil|
|    itmo-ibd|                json|       json|
|    itmo-ibd|collections.defau...|collections|
|project_name|              import|     import|
|    itmo-ibd|                 sys|        sys|
|    itmo-ibd|            requests|   requests|
|    itmo-ibd|             logging|    logging|
|    itmo-ibd|                  os|         os|
|    itmo-ibd|          subprocess| subprocess|
|    itmo-ibd|              pandas|     pandas|
|    itmo-ibd|              shutil|     shutil|
|    itmo-ibd|                json|     

In [24]:
# Make final dataset

In [25]:
intermediate_dataframe = (full_import_dataset.select("*")
                          .groupby(['project_name', 'package'])
                          .agg(F.count("*").alias("count_different_import")))

In [26]:
intermediate_dataframe.show()

+------------+-----------+----------------------+
|project_name|    package|count_different_import|
+------------+-----------+----------------------+
|    itmo-ibd|       json|                     2|
|    itmo-ibd|     shutil|                     2|
|    itmo-ibd|        sys|                     2|
|    itmo-ibd|         os|                     2|
|    itmo-ibd|collections|                     2|
|    itmo-ibd|     pandas|                     2|
|    itmo-ibd|   requests|                     2|
|    itmo-ibd| subprocess|                     2|
|    itmo-ibd|    logging|                     2|
|project_name|     import|                     2|
+------------+-----------+----------------------+



In [27]:
def rename_package(package_name):
    return f"package#{package_name}"

udf_rename_package = F.udf(rename_package, returnType=StringType())

In [28]:
intermediate_dataframe = intermediate_dataframe.select(
 "project_name", udf_rename_package("package").alias("package")
).cache()
intermediate_dataframe.show()

                                                                                

+------------+-------------------+
|project_name|            package|
+------------+-------------------+
|    itmo-ibd|       package#json|
|    itmo-ibd|     package#shutil|
|    itmo-ibd|        package#sys|
|    itmo-ibd|         package#os|
|    itmo-ibd|package#collections|
|    itmo-ibd|     package#pandas|
|    itmo-ibd|   package#requests|
|    itmo-ibd| package#subprocess|
|    itmo-ibd|    package#logging|
|project_name|     package#import|
+------------+-------------------+



In [29]:
pivot_package_dataframe = intermediate_dataframe.groupby("project_name").pivot("package").agg(F.count("*"))
pivot_package_dataframe.show()

+------------+-------------------+--------------+------------+---------------+----------+--------------+----------------+--------------+------------------+-----------+
|project_name|package#collections|package#import|package#json|package#logging|package#os|package#pandas|package#requests|package#shutil|package#subprocess|package#sys|
+------------+-------------------+--------------+------------+---------------+----------+--------------+----------------+--------------+------------------+-----------+
|project_name|               null|             1|        null|           null|      null|          null|            null|          null|              null|       null|
|    itmo-ibd|                  1|          null|           1|              1|         1|             1|               1|             1|                 1|          1|
+------------+-------------------+--------------+------------+---------------+----------+--------------+----------------+--------------+------------------+-----

In [30]:
pivot_ext_count_dataset = extensions_metrics_dataset.groupby("project_name").pivot("extension").agg(F.first("count"))
pivot_ext_count_dataset.show()

+------------+----------+--------------+----------------+-------------+-------------+-----------------+-------------+
|project_name|extension#|extension#.csv|extension#.ipynb|extension#.md|extension#.py|extension#.sample|extension#.sh|
+------------+----------+--------------+----------------+-------------+-------------+-----------------+-------------+
|    itmo-ibd|        85|            31|               2|            4|           10|               12|            1|
+------------+----------+--------------+----------------+-------------+-------------+-----------------+-------------+



In [31]:
final_dataset = pivot_package_dataframe.join(pivot_ext_count_dataset, ["project_name"])
final_dataset.show()

+------------+-------------------+--------------+------------+---------------+----------+--------------+----------------+--------------+------------------+-----------+----------+--------------+----------------+-------------+-------------+-----------------+-------------+
|project_name|package#collections|package#import|package#json|package#logging|package#os|package#pandas|package#requests|package#shutil|package#subprocess|package#sys|extension#|extension#.csv|extension#.ipynb|extension#.md|extension#.py|extension#.sample|extension#.sh|
+------------+-------------------+--------------+------------+---------------+----------+--------------+----------------+--------------+------------------+-----------+----------+--------------+----------------+-------------+-------------+-----------------+-------------+
|    itmo-ibd|                  1|          null|           1|              1|         1|             1|               1|             1|                 1|          1|        85|         

In [34]:
final_dataset_dict = final_dataset.collect()[0].asDict(True)
final_dataset_dict

{'project_name': 'itmo-ibd',
 'package#collections': 1,
 'package#json': 1,
 'package#logging': 1,
 'package#os': 1,
 'package#pandas': 1,
 'package#requests': 1,
 'package#shutil': 1,
 'package#subprocess': 1,
 'package#sys': 1,
 'extension#': 85,
 'extension#.csv': 31,
 'extension#.ipynb': 2,
 'extension#.md': 4,
 'extension#.py': 10,
 'extension#.sample': 12,
 'extension#.sh': 1}

In [25]:
PATH_TO_COLUMN_DATASET = "/home/Dmitry.Pogrebnoy/Desktop/itmo-ibd/data/pipeline/final_columns.csv"

In [60]:
columns_dataset = spark.read.csv(PATH_TO_COLUMN_DATASET, header=True).toPandas()["column_name"].to_list()
final_data_for_prediction = []
for item in columns_dataset:
    if item in final_dataset_dict:
        final_data_for_prediction.append(final_dataset_dict.get(item))
    else:
        final_data_for_prediction.append(0)

In [62]:
final_data_for_prediction

['itmo-ibd',
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,

In [66]:
# Not all zeros
sum(final_data_for_prediction[1:])

69

In [None]:
# Then we should pass the data to predictor and that's it

In [None]:
# bla bla bla

In [None]:
PATH_TO_TAG_DATASET = "/home/Dmitry.Pogrebnoy/Desktop/itmo-ibd/data/pipeline/final_tags.csv"

In [None]:
tags_dataset = spark.read.csv(PATH_TO_COLUMN_DATASET, header=True).toPandas()["tag_name"].to_list()

In [None]:
print(tags_dataset[0])
print(tags_dataset[10])
print(tags_dataset[20])