In [8]:
from src.utils.util import *
from src.constants.github_data_constants import *
from src.constants import *
import os

In [9]:

def read_statewise_usersdata(self) -> pd.DataFrame:
    """Load the state-wise users data.

    Reads the CSV identified by ``STATE_WISE_USERSDATA_FILE_NAME`` through the
    project helper ``read_from_csv`` and returns whatever that helper yields.
    ``self`` is accepted for signature compatibility and is not used.
    """
    statewise_users = read_from_csv(STATE_WISE_USERSDATA_FILE_NAME)
    return statewise_users

def filter_repository_data(self, repository) -> pd.DataFrame:
    """Flatten raw repository records into a table of selected fields.

    Args:
        self: Unused; kept for signature compatibility.
        repository: Iterable of repository mappings. ``None`` entries are
            skipped.

    Returns:
        The result of ``create_data_frame`` applied to one record per
        non-``None`` repository. Missing fields become ``None``, except the
        ``is_disabled`` / ``is_archieved`` flags, which default to ``False``.
    """

    def _to_record(repo):
        # The owner type lives one level down; a missing owner yields None.
        owner = repo.get('owner', {})
        return {
            "full_repository_name": repo.get('full_name'),
            "name": repo.get('name'),
            "user_type": owner.get('type'),

            "created_at": repo.get('created_at'),
            "pushed_at": repo.get('pushed_at'),
            "updated_at": repo.get('updated_at'),

            "default_branch": repo.get('default_branch'),
            "description": repo.get('description'),

            "open_issues": repo.get('open_issues'),
            "open_issues_count": repo.get('open_issues_count'),

            "visibility": repo.get('visibility'),
            "watchers": repo.get('watchers'),
            "watchers_count": repo.get('watchers_count'),

            "is_disabled": repo.get('disabled', False),
            "is_archieved": repo.get('archived', False),
            "languages_url": repo.get('languages_url'),
            "contributors_url": repo.get('contributors_url'),
        }

    records = []
    for repo in repository:
        if repo is None:
            continue
        records.append(_to_record(repo))
    return create_data_frame(records)

def get_contributors_count(self, df, col) -> list:
    """Collect one contribution figure per row of ``df``.

    For each URL in ``df[col]`` the response of ``read_from_url`` is inspected
    and the ``contributions`` value of the *first* element of the JSON list is
    recorded.
    NOTE(review): that is the first listed contributor's contribution count,
    not the number of contributors -- confirm this is the intended metric.

    Args:
        self: Unused; kept for signature compatibility.
        df: DataFrame holding the URL column.
        col: Name of the column with contributor URLs.

    Returns:
        A list with exactly ``len(df)`` entries, position-aligned with ``df``.
        Rows with a falsy URL, an unsuccessful response, or an
        empty/unexpected payload get ``0``.
    """
    count = []
    if col in df.columns:
        for url in df[col]:
            if not url:
                count.append(0)
                continue

            res = read_from_url(url)
            if res.status_code == 403:
                # Stop issuing requests; the remaining rows are zero-filled below.
                break
            if res.status_code != 200:
                # Record a placeholder so later results stay aligned with
                # their rows (previously nothing was appended here and every
                # subsequent count shifted up by one position).
                count.append(0)
                continue

            payload = res.json()
            if payload and isinstance(payload, list) and 'contributions' in payload[0]:
                count.append(payload[0]['contributions'])
            else:
                count.append(0)

    # Pad rows that were never reached (missing column or early 403 stop).
    count.extend([0] * (len(df) - len(count)))
    return count

def get_languages_count(self, df, col):
    """Fetch the language payload for every URL in ``df[col]``.

    One entry is collected per processed row, in row order, so that entry
    ``i`` of the result describes row ``i`` of ``df``.
    ``merge_dataframes_row_col_wise`` joins on that position. Rows with a
    falsy URL, an unsuccessful response, or an empty payload contribute an
    empty mapping instead of being skipped; skipping them would shift every
    later payload onto the wrong row.

    Args:
        self: Unused; kept for signature compatibility.
        df: DataFrame holding the URL column.
        col: Name of the column with language URLs.

    Returns:
        ``create_data_frame`` applied to the collected payloads. A 403
        response stops the loop early, so the result may be shorter than
        ``df``; it is empty when ``col`` is not a column of ``df``.
    """
    payloads = []
    if col in df.columns:
        for url in df[col]:
            if not url:
                # No URL to query; keep the slot so positions stay aligned.
                payloads.append({})
                continue

            res = read_from_url(url)
            if res.status_code == 403:
                break
            if res.status_code != 200:
                payloads.append({})
                continue

            languages = res.json()
            payloads.append(languages if languages else {})

    return create_data_frame(payloads)

def merge_dataframes_row_col_wise(self, df1, df2):
    """Attach per-repository language figures to the repository rows.

    ``df2`` is expected to hold one row per repository (same positional index
    as ``df1``) and one column per language. It is reshaped to long form
    (``language``, ``repo_index``, ``language_count``) and inner-joined to
    ``df1`` on the row index, so each repository row is repeated once per
    language column.

    Side effects: prints a progress line and writes the merged frame through
    ``write_to_csv`` to ``STATE_WISE_USERSDATA_FILE_NAME`` inside
    ``ROOT_DIR/DATASET_LOCATION/EXTERNAL_DATASET_DIRECTORY_NAME`` (created if
    missing).
    NOTE(review): every call targets the same file name, and this function is
    reached from worker threads via ``fetch_repos`` -- confirm that
    overwriting/sharing this file is intended.

    Args:
        self: Unused; kept for signature compatibility.
        df1: Repository DataFrame.
        df2: Language DataFrame aligned with ``df1`` by position.

    Returns:
        The merged DataFrame.
    """
    # Languages become rows and repository positions become columns. Naming
    # the axis first guarantees the reset column is called 'language' even
    # when df2.columns carries its own name.
    languages_wide = df2.T.rename_axis('language').reset_index()

    languages_long = languages_wide.melt(
        id_vars='language', var_name='repo_index', value_name='language_count'
    )
    languages_long['repo_index'] = languages_long['repo_index'].astype(int)

    repos_indexed = df1.reset_index().rename(columns={'index': 'index_column'})

    merged_df = pd.merge(
        repos_indexed, languages_long, left_on='index_column', right_on='repo_index'
    ).drop(columns=['index_column', 'repo_index'])
    print("merging inside the function....DONE")

    dir_path = os.path.join(ROOT_DIR, DATASET_LOCATION, EXTERNAL_DATASET_DIRECTORY_NAME)
    os.makedirs(dir_path, exist_ok=True)

    write_to_csv(
        df=merged_df,
        file_path=join_paths(path1=dir_path, path2=STATE_WISE_USERSDATA_FILE_NAME),
    )
    return merged_df

def fetch_repos(self=None, username=None, state=None):
    """Fetch a user's repositories and return them as an enriched DataFrame.

    Issues a GET for ``{USERS_URL}/{username}/repos`` via ``read_from_url``,
    flattens the response with ``filter_repository_data``, adds the
    ``user name``, ``state`` and ``contributions_count`` columns, and merges
    in the per-repository language figures.

    These helpers are module-level functions in this notebook, so they are
    called directly and ``self`` is only passed through. ``self`` is optional
    so that keyword-only calls such as ``fetch_repos(username=..., state=...)``
    work.

    Args:
        self: Optional pass-through value for the helper functions.
        username: Account whose repositories are requested (required).
        state: State label stored alongside every repository row.

    Returns:
        The merged DataFrame, or an empty DataFrame when the request is
        answered with HTTP 403.

    Raises:
        TypeError: If ``username`` is not supplied.
        Exception: For any response status other than 200 or 403.
    """
    if username is None:
        raise TypeError("fetch_repos() missing required argument: 'username'")

    response = read_from_url(f"{USERS_URL}/{username}/repos")
    status = response.status_code

    if status == 403:
        # Nothing was fetched; hand back an empty frame rather than touching
        # a result variable that was never assigned on this path.
        return pd.DataFrame()
    if status != 200:
        # Exception() accepts no keyword arguments, so the details go into
        # the message.
        raise Exception(f"Error fetching repositories for {username}: HTTP {status}")

    repos = response.json()

    df = filter_repository_data(self, repository=repos)
    df['user name'] = [username] * len(df)
    df['state'] = [state] * len(df)
    df['contributions_count'] = get_contributors_count(self, df=df, col="contributors_url")
    temp_df = get_languages_count(self, df, 'languages_url')
    return merge_dataframes_row_col_wise(self, df, temp_df)


In [10]:
import numpy as np
import pandas as pd
import os

def stratified_sample(df, sample_size, random_state=None):
    """Draw a sample whose per-state shares mirror those of ``df``.

    Each state's quota is its share of ``df['state']`` multiplied by
    ``sample_size`` and rounded to the nearest integer, so the quotas may not
    add up to exactly ``sample_size``.

    Args:
        df: DataFrame with a ``state`` column.
        sample_size: Target number of rows across all states.
        random_state: Forwarded to ``DataFrame.sample`` for every state.

    Returns:
        The per-state samples combined by ``groupby(...).apply``.
    """
    state_share = df['state'].value_counts(normalize=True)
    per_state_quota = np.rint(state_share * sample_size).astype(int)

    by_state = df.groupby('state', group_keys=False)
    return by_state.apply(
        # ``.name`` is the group key (the state) set by groupby.apply.
        lambda state_rows: state_rows.sample(
            n=per_state_quota[state_rows.name], random_state=random_state
        )
    )

def sample_users_data_with_stratified_sampling(raw_data_file_path: str, random_state=None):
    """Write one stratified sample CSV per configured sample fraction.

    The raw users file is read with ``read_from_csv``. For every fraction in
    ``SAMPLE_SIZES_PERCENTAGES`` a stratified sample of
    ``int(total_records * fraction)`` rows is drawn, and each sample is
    written to
    ``SAMPLED_USERS_DATASET_FILE_PATH/CURRENT_TIME_STAMP/samples_of_size_<P>_percentage.csv``.

    ``<P>`` is the fraction as a whole-number percentage. It is computed with
    ``round`` rather than ``int`` because float error would otherwise truncate
    e.g. ``0.29 * 100`` to ``28``, giving a label that the integer-percent
    lookup in ``download_users_data`` cannot find.

    Args:
        raw_data_file_path: Path of the raw users CSV.
        random_state: Forwarded to ``stratified_sample``.
    """
    data = read_from_csv(file_path=raw_data_file_path)
    total_records = len(data)

    # All samples are drawn before anything is written, as before.
    samples = {
        percent: stratified_sample(
            df=data,
            sample_size=int(total_records * percent),
            random_state=random_state,
        )
        for percent in SAMPLE_SIZES_PERCENTAGES
    }

    directory_location = os.path.join(SAMPLED_USERS_DATASET_FILE_PATH, CURRENT_TIME_STAMP)
    create_directories(directories_path=directory_location)

    for percent, sample in samples.items():
        percent_label = round(percent * 100)
        path = os.path.join(directory_location, f"samples_of_size_{percent_label}_percentage.csv")
        write_to_csv(df=sample, file_path=path)

# Example usage
# sample_users_data_with_stratified_sampling("path_to_your_file.csv", random_state=None)


In [11]:
# sample_users_data_with_stratified_sampling("D:\\DM Project\\Testing\\GitHubDataRetrieval\\data\\raw\\state_wise_user_names.csv")


In [12]:
from concurrent.futures import ThreadPoolExecutor, as_completed

def fetch_and_update(user_name, state_name):
    """Fetch one user's repositories without letting a failure propagate.

    Args:
        user_name: Account passed to ``fetch_repos`` as ``username``.
        state_name: State passed to ``fetch_repos`` as ``state``.

    Returns:
        ``(state_name, frame)``. ``frame`` is the ``fetch_repos`` result, or
        an empty DataFrame when that result is ``None`` or when the call
        raised (the error is printed).
    """
    try:
        repos_frame = fetch_repos(username=user_name, state=state_name)
    except Exception as e:
        print(f"Error fetching data for {user_name}: {e}")
        # Callers always receive a DataFrame, even after a failure.
        return state_name, pd.DataFrame()

    if repos_frame is None:
        repos_frame = pd.DataFrame()
    return state_name, repos_frame

def download_users_data(sample_fraction, file_path: str):
    """Sample the users file, fetch repositories for the sample, save one CSV.

    Steps:
      1. Regenerate the stratified sample files from ``file_path``.
      2. Pick the sub-directory of ``SAMPLED_USERS_DATASET_FILE_PATH`` that
         sorts last by name and locate
         ``samples_of_size_<sample_fraction>_percentage.csv`` in it.
         NOTE(review): this is a plain string sort of the directory names --
         confirm the timestamp format sorts chronologically.
      3. Call ``fetch_and_update`` for every sampled user on a pool of 10
         threads and group the non-empty results by state.
      4. Write the concatenated result to ``REPOSITORY_DATA_FILE_PATH``.

    Args:
        sample_fraction: Whole-number percentage used in the sample file name.
        file_path: Path of the raw users CSV handed to the sampling step.

    Raises:
        CustomException: If no sample sub-directory or no matching sample
            file exists (the underlying error is wrapped, as before).
    """
    sample_users_data_with_stratified_sampling(raw_data_file_path=file_path)

    subdirectories = sorted(os.listdir(SAMPLED_USERS_DATASET_FILE_PATH), reverse=True)
    print(f"subdirectories in the sampled users folder are: {subdirectories}")

    try:
        if not subdirectories:
            raise FileNotFoundError(f"No files found in {SAMPLED_USERS_DATASET_FILE_PATH} directory")

        search_path = os.path.join(
            SAMPLED_USERS_DATASET_FILE_PATH,
            subdirectories[0],
            f"samples_of_size_{sample_fraction}_percentage.csv",
        )
        matches = glob.glob(search_path)
        if not matches:
            # Fail here with a clear message instead of passing an empty
            # path on to read_from_csv.
            raise FileNotFoundError(f"No sample file found at {search_path}")
        filtered_file_name = matches[0]
    except Exception as e:
        raise CustomException(e, sys)

    print(f"Filtered file name: {filtered_file_name}")
    sampled_users = read_from_csv(file_path=filtered_file_name)

    # Collect frames per state and concatenate once per state; repeated
    # pairwise concatenation copies the accumulated data on every step.
    frames_by_state = {}
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [
            executor.submit(fetch_and_update, row['username'], row['state'])
            for _, row in sampled_users.iterrows()
        ]
        for future in as_completed(futures):
            state_name, df1 = future.result()
            if not df1.empty:
                frames_by_state.setdefault(state_name, []).append(df1)

    create_directories(REPOSITORY_DATA_FILE_DIRECTORY_NAME)

    # The state column is (re)assigned so it is present even if a fetched
    # frame lacks it.
    state_frames = [
        pd.concat(frames, ignore_index=True).assign(state=state)
        for state, frames in frames_by_state.items()
    ]
    combined_df = pd.concat(state_frames, ignore_index=True) if state_frames else pd.DataFrame()

    combined_df.to_csv(REPOSITORY_DATA_FILE_PATH, index=False)


In [13]:
# from os import listdir
# from os.path import join
# import pandas as pd
# import glob

# def download_users_data(sample_fraction, file_path: str):
#     sample_users_data_with_stratified_sampling(raw_data_file_path=file_path)

#     subdirectories = os.listdir(SAMPLED_USERS_DATASET_FILE_PATH)

#     subdirectories.sort(reverse=True)
#     print(f"subdirectories in the sampled users folder are: {subdirectories}")

#     latest_subdir = subdirectories[0] if subdirectories else None
#     filtered_file_name = ""
    

#     if latest_subdir:
#         search_path = join(SAMPLED_USERS_DATASET_FILE_PATH, latest_subdir, f"samples_of_size_{sample_fraction}_percentage.csv")
#         filtered_file_name = next(glob.iglob(search_path), None)
#         if not filtered_file_name:
#             raise CustomException(f"No files found in {SAMPLED_USERS_DATASET_FILE_PATH} directory")
#     else:
#         raise CustomException(f"No subdirectories found in {SAMPLED_USERS_DATASET_FILE_PATH}")

#     print(f"Filtered file name: {filtered_file_name}")
#     combined_df = read_from_csv(file_path= filtered_file_name)
#     combined_df = combined_df[0:5]
#     print(combined_df)

#     # repos_dict = {}
#     # for _, row in combined_df.iterrows():
#     #     state_name, user_name = row['state'], row['username']
#     #     try:
#     #         df1 = fetch_repos(username=user_name, state=state_name)
#     #         if not df1.empty:
#     #             repos_dict.setdefault(state_name, []).append(df1)
#     #     except Exception as e:
#     #         print(f"Error fetching data for {user_name}: {e}")

#     # for state, df_list in repos_dict.items():
#     #     repos_dict[state] = pd.concat(df_list, ignore_index=True)

#     # final_df = pd.concat(repos_dict.values(), ignore_index=True)
#     # create_directories(REPOSITORY_DATA_FILE_DIRECTORY_NAME)
#     # final_df.to_csv(REPOSITORY_DATA_FILE_PATH, index=False)

# # Example usage
# # download_users_data(3, "path_to_your_file.csv")


In [14]:
# Run the pipeline for the 1 percent sample (sample_fraction=1 selects
# "samples_of_size_1_percentage.csv").
# NOTE(review): the raw-data location is an absolute, machine-specific Windows
# path; consider deriving it from a configurable project/data directory.
download_users_data(1, "D:\\DM Project\\Testing\\GitHubDataRetrieval\\data\\raw\\state_wise_user_names.csv")

subdirectories in the sampled users folder are: ['24-11-23_13-30-23']
Filtered file name: d:\DM Project\Testing\GitHubDataRetrieval\data\processed\sampled_user_names_files\24-11-23_13-30-23\samples_of_size_1_percentage.csv
merging inside the function....DONE
merging inside the function....DONE
merging inside the function....DONE
merging inside the function....DONE
merging inside the function....DONE
merging inside the function....DONE
merging inside the function....DONE
merging inside the function....DONE
merging inside the function....DONE
merging inside the function....DONE
merging inside the function....DONE
merging inside the function....DONE
merging inside the function....DONE
merging inside the function....DONE
merging inside the function....DONE
merging inside the function....DONE
merging inside the function....DONE
merging inside the function....DONE
merging inside the function....DONE
merging inside the function....DONE
merging inside the function....DONE
merging inside the fu