In [39]:
import os
import math
import random
import csv
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px



In [31]:
# Inclusive bounds used by load_dataset to keep a row: both the star count
# and the citation count must lie within [MIN_*, MAX_*].
MIN_STAR, MAX_STAR = 10, 10000
MIN_CITATION, MAX_CITATION = 10, 10000

In [32]:
def read_csv(csv_path):
    """Read a dataset CSV and return its column index map and integer data.

    Only the first column and the last five columns of every row are kept.
    Completely blank lines (for example a trailing empty line) are skipped so
    they cannot produce an empty header or a ragged data array.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        A ``(csv_header, csv_data)`` tuple where ``csv_header`` maps each kept
        header name to its column index in ``csv_data``, and ``csv_data`` is a
        2-D numpy array with one row per data record. When the file has a
        header but no data rows, ``csv_data`` has shape ``(0, n_columns)`` so
        it can still be concatenated with other 2-D arrays.

    Raises:
        ValueError: If the file contains no header row, or if a kept data
            value cannot be parsed as an integer.
    """
    with open(csv_path, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='|')
        # csv.reader yields [] for blank lines; drop those rows up front.
        kept_rows = [row[:1] + row[-5:] for row in reader if row]

    if not kept_rows:
        raise ValueError(f"no header row found in {csv_path}")

    header_row, *raw_rows = kept_rows

    # Integer values are stored as str in the file; convert them to int.
    csv_data = [list(map(int, sample)) for sample in raw_rows]

    # Build the column name -> column index lookup.
    csv_header = {name: idx for idx, name in enumerate(header_row)}

    if not csv_data:
        # np.array([]) would be 1-D; keep the result 2-D for callers.
        return csv_header, np.empty((0, len(header_row)), dtype=int)
    return csv_header, np.array(csv_data)

In [33]:
def load_dataset(csv_path_, exclude_outlier=False):
    """Load one dataset CSV and keep only rows inside the star/citation bounds.

    Args:
        csv_path_: Path of the CSV file, forwarded to ``read_csv``.
        exclude_outlier: Accepted but never read by this function; the range
            filter below is always applied.
            NOTE(review): confirm whether the filter was meant to be
            conditional on this flag.

    Returns:
        A 2-D float array containing the rows whose column 1 lies in
        ``[MIN_STAR, MAX_STAR]`` and whose last column lies in
        ``[MIN_CITATION, MAX_CITATION]`` (bounds inclusive).
    """
    _, rows = read_csv(csv_path_)

    # Stacking onto an empty (0, 6) float array requires six columns and
    # promotes the result to float.
    table = np.asarray(np.concatenate((np.empty([0, 6]), rows), axis=0))

    stars = table[:, 1]
    citations = table[:, -1]

    # True for rows to keep, i.e. rows that are within all four bounds.
    in_range = (
        (stars <= MAX_STAR)
        & (stars >= MIN_STAR)
        & (citations <= MAX_CITATION)
        & (citations >= MIN_CITATION)
    )

    return table[in_range, :]

In [34]:
def as_df(array_data, year):
    """Convert a dataset array into a log-scaled DataFrame tagged with a year.

    Args:
        array_data: 2-D array whose first column is dropped and whose next
            five columns become Stars, Watchers, Forks, Issues and Citations.
        year: Value stored in every row of the added ``Year`` column.

    Returns:
        A DataFrame holding ``log10(value + 1)`` for the five count columns
        plus the ``Year`` column.
    """
    column_names = ['Stars', 'Watchers', 'Forks', 'Issues', 'Citations']
    # The +1 shift keeps zero counts finite under log10.
    log_counts = np.log10(array_data[:, 1:] + 1)
    return pd.DataFrame(log_counts, columns=column_names).assign(Year=year)

### Data Loading

In [35]:
years = ['2018', '2019', '2020', '2021']


def csv_path_with_year(y):
    """Return the path of the dataset CSV for year ``y``."""
    return Path(f"{y}_final_dataset.csv")


csv_path_list = list(map(csv_path_with_year, years))

# One log-scaled DataFrame per year, keyed by the year string.
df_dict = {}
for year, _csv_path in zip(years, csv_path_list):
    data_as_array = load_dataset(_csv_path)
    df_dict[year] = as_df(data_as_array, year)

# Stack all years into one frame; each per-year row index is kept as-is.
df_all = pd.concat(df_dict.values())

In [36]:
# csv_data = np.empty([0, 6])
# for _csv_path in csv_path_list:
#     _, _csv_data = read_csv(_csv_path)
#     csv_data = np.concatenate((csv_data, _csv_data), axis=0)
# csv_data = np.asarray(csv_data)

# print(csv_data[:3])

# outlier_indices = np.logical_and.reduce((
#     csv_data[:, 1] <= MAX_STAR,
#     csv_data[:, 1] >= MIN_STAR,
#     csv_data[:, -1] <= MAX_CITATION,
#     csv_data[:, -1] >= MIN_CITATION
#     ))

# outlier_csv_data = csv_data[outlier_indices, :]

# print(outlier_csv_data[:3])

In [38]:
# df_all holds log10(count + 1) values (see as_df), so the axes are labelled
# as such instead of as raw counts.
fig = px.scatter(
    df_all,
    x='Stars',
    y='Citations',
    marginal_x="rug",
    marginal_y="histogram",
    title='Stars vs. citations (all years)',
    labels={
        'Stars': 'log10(Stars + 1)',
        'Citations': 'log10(Citations + 1)',
    },
)
fig.show()

In [40]:
# Same scatter as before, coloured by year. df_all holds log10(count + 1)
# values (see as_df), so the axes are labelled as such.
fig = px.scatter(
    df_all,
    x='Stars',
    y='Citations',
    color='Year',
    marginal_x="rug",
    marginal_y="histogram",
    title='Stars vs. citations by year',
    labels={
        'Stars': 'log10(Stars + 1)',
        'Citations': 'log10(Citations + 1)',
    },
)
fig.show()

In [44]:
# Restrict the scatter to a single year. df_all holds log10(count + 1)
# values (see as_df), so the axes are labelled as such.
target_year = str(2018)
fig = px.scatter(
    df_all.loc[df_all['Year'] == target_year],
    x='Stars',
    y='Citations',
    color='Year',
    marginal_x="rug",
    marginal_y="histogram",
    title=f'Stars vs. citations ({target_year})',
    labels={
        'Stars': 'log10(Stars + 1)',
        'Citations': 'log10(Citations + 1)',
    },
)
fig.show()

In [51]:
# 3-D view of stars, forks and citations for every year. df_all holds
# log10(count + 1) values (see as_df), so the axes are labelled as such.
fig = px.scatter_3d(
    df_all,
    x='Stars',
    y='Forks',
    z='Citations',
    color='Year',
    size='Citations',
    size_max=20,
    title='Stars, forks and citations by year',
    labels={
        'Stars': 'log10(Stars + 1)',
        'Forks': 'log10(Forks + 1)',
        'Citations': 'log10(Citations + 1)',
    },
)
fig.show()

In [52]:
# 3-D view restricted to a single year. df_all holds log10(count + 1)
# values (see as_df), so the axes are labelled as such.
target_year = str(2018)
fig = px.scatter_3d(
    df_all.loc[df_all['Year'] == target_year],
    x='Stars',
    y='Forks',
    z='Citations',
    color='Year',
    size='Citations',
    size_max=20,
    title=f'Stars, forks and citations ({target_year})',
    labels={
        'Stars': 'log10(Stars + 1)',
        'Forks': 'log10(Forks + 1)',
        'Citations': 'log10(Citations + 1)',
    },
)
fig.show()
