In [None]:
import pandas as pd
import numpy as np
import datetime
import warnings
warnings.filterwarnings('ignore')
from IPython.display import Image, HTML

import time
from datetime import datetime, timedelta

## This script assumes the following files are available:

- commit-to-timestamp (c2ta) map for the other commits. 
- blob-to-commit (b2c) map for the hackathon commits 
- commit-to-project (c2P) map for the other commits 
- project classification file ('discretized_new.csv')
- Blob file type csv file ('Hack_b2type.csv')
- Project info from DEVPOST ('PrAllInfoDEVPOST.csv')
- DEVPOST project-to-WoC-URL mapping CSV file ('woc-urls.csv')
- Project Info from MongoDB ('PrAllInfo.csv')
- CSV file for the results from CodeGeneration notebook ('finalResultDF.csv')

See the README for the corresponding commands using the World of Code tool.

### UsageFlag Calculation

In [None]:
# World of Code maps (semicolon-separated): commit-to-timestamp, blob-to-commit, commit-to-project.
CommitsDF = pd.read_csv('SampleData/c2taAll.csv',sep=';')
b2cDF = pd.read_csv('SampleData/b2c.csv',sep=';')
c2PDF = pd.read_csv('SampleData/c2PAll.csv',sep=';')

CompareDF3 = pd.read_csv('SampleData/finalResultDF.csv',sep=';')  ## The output of the CodeGeneration notebook
# Project classification, DEVPOST project info, MongoDB project info and the
# DEVPOST-project-to-WoC-URL mapping (see the file list at the top of the notebook).
PrCatDF = pd.read_csv('discretized_new.csv', sep=',')
PrInfoDEVPOST = pd.read_csv('PrAllInfoDEVPOST.csv', sep=';')
PrInfo2 = pd.read_csv('PrAllInfo.csv', sep=';')
wocUrl = pd.read_csv('woc-urls.csv',sep = ',')

In [None]:
# Filters

# Drop the leftover unnamed column (presumably a saved index -- TODO confirm).
# errors='ignore' keeps this cell re-runnable once the column is already gone;
# the original `del` raised KeyError on a second run.
CompareDF3 = CompareDF3.drop(columns=['Unnamed: 0'], errors='ignore')
CompareDF3 = CompareDF3[
    CompareDF3['AuthorFlag'].isin([1, 2])    ## Author and Co-Author
    & CompareDF3['TimingFlag'].isin([2])     ## During only
]


In [None]:
# Join to get all commits of each blob
# NOTE(review): the original cell referenced `b2cDF2`, which is never defined in this
# notebook (NameError on a fresh kernel). `b2cDF`, loaded from SampleData/b2c.csv, is
# assumed to be the intended frame -- confirm it carries BlobHash and BCommitHash.
Comb1 = pd.merge(CompareDF3, b2cDF, how='inner', left_on=['BlobHash'], right_on=['BlobHash']).drop_duplicates()


In [None]:
# Join to add commit timestamp
# NOTE(review): the original cell referenced `CommitsDF2`, which is never defined in this
# notebook (NameError on a fresh kernel). `CommitsDF`, loaded from SampleData/c2taAll.csv,
# is assumed to be the intended frame -- confirm it carries CommitHash and CTimeStamp.
Comb2 = (
    pd.merge(Comb1, CommitsDF, how='inner', left_on=['BCommitHash'], right_on=['CommitHash'])
    .drop_duplicates()
    .drop(columns=['BCommitHash'])   # duplicate of CommitHash after the join
)


In [None]:
# Filter: keep only commits made on or after the hackathon start date.
# NOTE(review): CTimeStamp is still text at this point (a later cell calls .str.len() on it),
# so this presumably compares strings -- confirm both columns share one timestamp format.
onOrAfterStart = Comb2['CTimeStamp'] >= Comb2['hackathonStartDate']
Comb3 = Comb2[onOrAfterStart]


In [None]:
# Join to add project for each commit
# NOTE(review): the original cell referenced `c2P_FullDF`, which is never defined in this
# notebook (NameError on a fresh kernel). `c2PDF`, loaded from SampleData/c2PAll.csv, is
# assumed to be the intended frame -- confirm it carries CommitHash and CProject.
Comb4 = pd.merge(Comb3, c2PDF, how='inner', left_on=['CommitHash'], right_on=['CommitHash']).drop_duplicates()


In [None]:
# Join to add project classification of each commit
# (matches each commit's project, CProject, against PrCatDF.projectID)
Comb5 = (
    Comb4
    .merge(PrCatDF, how='inner', left_on='CProject', right_on='projectID')
    .drop_duplicates()
)


In [None]:
# logic to get UsageFlag

def checkUsage(row):
    """Classify one blob/commit row by the project the commit belongs to.

    Args:
        row: A namedtuple from ``DataFrame.itertuples()`` exposing ``ProjectID_x``
            (the hackathon project), ``CProject`` (the project of the commit) and
            ``flag`` (the classification value joined in from PrCatDF).

    Returns:
        '1' when the commit is in the hackathon project itself; otherwise
        '3', '4' or '5' for flag 1, 2 or 3 (small, medium, large project).
        ``None`` when the flag is none of 1, 2, 3.
    """
    if row.ProjectID_x == row.CProject:
        return '1'   # Same Hackathon Project
    # flag -> usage category: small / medium / large project
    return {1: '3', 2: '4', 3: '5'}.get(row.flag)


def iterrows_impl(df):
    """Apply ``checkUsage`` to every row of ``df``.

    The result is indexed like ``df``. A fresh 0..n-1 index would be re-aligned by
    label when assigned to a column of a frame whose index has gaps (for example
    after ``drop_duplicates()``), putting flags on the wrong rows or leaving NaN.

    Returns:
        pd.Series of usage flags sharing ``df.index``.
    """
    return pd.Series([checkUsage(row) for row in df.itertuples()], index=df.index)
  

# Assign by position: Comb5 comes from merge(...).drop_duplicates(), so its index labels
# may have gaps, and a Series with a fresh 0..n-1 index would be re-aligned by label
# (wrong rows / NaN). to_numpy() strips the index so row i gets flag i.
Comb5['UsageFlag'] = pd.Series(iterrows_impl(Comb5)).to_numpy()

Comb5

In [None]:
# Concat the results on the correct level

groupCols = ['devpost_id', 'ProjectID_x', 'hackathonStartDate', 'hackathonEndDate',
             'BlobHash', 'TimingFlag', 'AuthorFlag']
# sorted() makes the joined label canonical: plain set iteration order is arbitrary, so
# the same combination could otherwise appear as both "1,3" and "3,1" and be counted as
# two different categories (and differ between runs).
result = (
    Comb5[groupCols + ['UsageFlag']]
    .groupby(groupCols)
    .agg({'UsageFlag': lambda x: ','.join(sorted(set(x)))})
    .reset_index()
)
print(set(result['UsageFlag']))
result.to_csv('final_result.csv', sep=';')

In [None]:
# Work on a copy so that columns added to finalResult later do not touch `result`.
finalResult = result.copy()
finalResult

In [None]:

# Number of blobs per UsageFlag combination, plus each combination's share of the total.
result2 = finalResult[['BlobHash','UsageFlag']].groupby(['UsageFlag']).agg(['count'])
# apply() works column by column here, so each count is divided by its column total.
result2['Percentage'] = result2.apply(lambda x: 100 * x / float(x.sum()))


In [None]:
from matplotlib.pyplot import figure
from matplotlib import pyplot as plt 
from matplotlib.gridspec import GridSpec

figure(num=None, figsize=(8, 6), dpi=80, facecolor='w', edgecolor='k')

# Sort once by percentage; labels and bar lengths are both read from the sorted frame.
byPercentage = result2.sort_values(by=['Percentage'])
x = byPercentage.reset_index()['UsageFlag'].values.flatten()
data = byPercentage[['Percentage']].values.flatten()

x_pos = list(range(len(x)))

plt.barh(x_pos, data, color='black')
plt.title("Percentage of usage categories")
plt.xlabel("Percentage")
plt.ylabel("Blob Usage Category")
plt.yticks(x_pos, x)

# Print the value just right of each bar's end.
for barIdx, pct in enumerate(data):
    label = str(round(pct, 2)) + '%'
    plt.text(pct + 0.5, barIdx - 0.1, label, color='Black')

plt.show()

In [None]:
from matplotlib.pyplot import figure
figure(num=None, figsize=(8, 6), dpi=80, facecolor='w', edgecolor='k')

# Same bar order as the percentage chart (sorted by Percentage), but plotting raw counts.
byPercentage = result2.sort_values(by=['Percentage'])
x = byPercentage.reset_index()['UsageFlag'].values.flatten()
data = byPercentage[['BlobHash']].values.flatten()

x_pos = list(range(len(x)))

plt.barh(x_pos, data, log=True, color='black')
plt.title("Blob Count for each usage category ( Log Scale )")
plt.xlabel("Blob Count (Log Scale)")
plt.ylabel("Blob Usage category")
plt.yticks(x_pos, x)

plt.show()

In [None]:
# logic to get UsageFlagMax
# To calculate the maximum usage

def checkUsageMax(row):
    """Return the largest flag in a row's comma-separated ``UsageFlag`` value.

    The flags are compared as strings, which matches numeric order for the
    single-character flags used in this notebook.
    """
    return max(str(row.UsageFlag).split(","))


def iterrows_impl(df):
    """Apply ``checkUsageMax`` to every row of ``df``.

    The result shares ``df.index`` so that assigning it to a column of ``df`` lines
    up row by row even when the index is not 0..n-1.
    """
    return pd.Series([checkUsageMax(row) for row in df.itertuples()], index=df.index)
  

# Highest usage category reached by each blob.
finalResult['UsageFlagMax'] = iterrows_impl(finalResult)


In [None]:
# Distinct maximum-usage categories that occur.
print(set(finalResult['UsageFlagMax']))
# Not the last expression of the cell, so this value is not displayed.
finalResult.shape

# Blob count per maximum-usage category and its share of the total.
result3 = finalResult[['BlobHash','UsageFlagMax']].groupby(['UsageFlagMax']).agg(['count'])
result3['Percentage'] = result3.apply(lambda x: 100 * x / float(x.sum()))
result3


In [None]:
from matplotlib.pyplot import figure
figure(num=None, figsize=(8, 6), dpi=80, facecolor='w', edgecolor='k')

# Sort once by percentage; labels and bar lengths are both read from the sorted frame.
byPercentage = result3.sort_values(by=['Percentage'])
x = byPercentage.reset_index()['UsageFlagMax'].values.flatten()
data = byPercentage[['Percentage']].values.flatten()

x_pos = list(range(len(x)))

plt.barh(x_pos, data, color='black')
plt.title("Percentage of usage categories Max")
plt.xlabel("Percentage")
plt.ylabel("Blob Usage Category Max")
plt.yticks(x_pos, x)

# Print the value just right of each bar's end.
for barIdx, pct in enumerate(data):
    label = str(round(pct, 2)) + '%'
    plt.text(pct + 1, barIdx - 0.1, label, color='Black')

plt.show()

In [None]:
from matplotlib.pyplot import figure
figure(num=None, figsize=(8, 6), dpi=80, facecolor='w', edgecolor='k')

# Same bar order as the percentage chart (sorted by Percentage), but plotting raw counts.
byPercentage = result3.sort_values(by=['Percentage'])
x = byPercentage.reset_index()['UsageFlagMax'].values.flatten()
data = byPercentage[['BlobHash']].values.flatten()

x_pos = list(range(len(x)))

plt.barh(x_pos, data, color='black')
plt.title("Blob Count for each usage categories Max")
plt.xlabel("Blob Count")
plt.ylabel("Blob Usage Category Max")
plt.yticks(x_pos, x)

# Print the count just right of each bar's end.
for barIdx, count in enumerate(data):
    plt.text(count + 1, barIdx - 0.1, str(round(count, 2)), color='Black')

plt.show()

In [None]:
# Persist the per-blob usage flags (including UsageFlagMax), without the index column.
finalResult.to_csv('finalResult.csv',sep=';' ,index=False)

In [None]:
# Persist the fully joined commit-level table. Note the file is named 'Comp5', not 'Comb5'.
Comb5.to_csv('Comp5.csv',sep=';' ,index=False)

# Data Transformation for further analysis

### Calculate Pivot for Blob Usage per project on week basis

In [None]:
# Work on a copy so the timestamp conversions below leave Comb5 untouched.
Comb6 = Comb5.copy()

In [None]:
# Keep only rows whose CTimeStamp text is exactly 19 characters long
# (the length of a 'YYYY-MM-DD HH:MM:SS' value, the format parsed in the next cell).

mask = Comb6['CTimeStamp'].str.len() == 19
# reset_index() without drop=True keeps the old index as a column named 'index'.
Comb6 = Comb6.loc[mask].reset_index()

In [None]:
# Flag commits made less than 2 years after the hackathon end date

from dateutil.relativedelta import relativedelta

# Parse both columns as datetimes so they can be compared and offset.
Comb6['hackathonEndDate'] =  pd.to_datetime(Comb6['hackathonEndDate'], format='%Y-%m-%d %H:%M:%S')
Comb6['CTimeStamp'] =  pd.to_datetime(Comb6['CTimeStamp'], format='%Y-%m-%d %H:%M:%S')

def check2YearsFlag(row):
    """Tell whether a commit falls within 24 months of the hackathon end.

    Args:
        row: A namedtuple from ``DataFrame.itertuples()`` whose ``CTimeStamp`` and
            ``hackathonEndDate`` fields are pandas timestamps.

    Returns:
        '1' when ``CTimeStamp`` is strictly earlier than ``hackathonEndDate`` plus
        24 calendar months, otherwise '0'.
    """
    # pd.DateOffset does calendar-month arithmetic and needs no import beyond pandas.
    cutoff = row.hackathonEndDate + pd.DateOffset(months=24)
    return '1' if row.CTimeStamp < cutoff else '0'


def iterrows_impl(df):
    """Apply ``check2YearsFlag`` to every row of ``df``.

    The result shares ``df.index`` so that assigning it to a column of ``df`` lines
    up row by row even when the index is not 0..n-1.
    """
    return pd.Series([check2YearsFlag(row) for row in df.itertuples()], index=df.index)
  

# '1' for commits within two years of the hackathon end, '0' otherwise.
Comb6['2YearsFlag'] = iterrows_impl(Comb6)


In [None]:
# Apply the two years filter
# Comb6 already has an 'index' column from its own reset_index(), so this second
# reset_index() stores the old index as 'level_0'; both are deleted further down.

Comb7 = Comb6[Comb6['2YearsFlag']=='1'].reset_index().copy()

In [None]:
# Update column data type
# hackathonEndDate and CTimeStamp were already parsed on Comb6; hackathonStartDate is
# parsed here for the first time.

Comb7['hackathonEndDate'] =  pd.to_datetime(Comb7['hackathonEndDate'], format='%Y-%m-%d %H:%M:%S')
Comb7['CTimeStamp'] =  pd.to_datetime(Comb7['CTimeStamp'], format='%Y-%m-%d %H:%M:%S')
Comb7['hackathonStartDate'] =  pd.to_datetime(Comb7['hackathonStartDate'], format='%Y-%m-%d %H:%M:%S')


In [None]:
# Calculate Week Numbers

def WeekCalc(row):
    """Number of calendar weeks between the hackathon-end week and the commit week.

    Both timestamps are moved back to the Monday 00:00 that starts their week, so the
    difference is an exact multiple of 7 days. The original kept the time of day and
    relied on ``round()`` to absorb the leftover fraction.

    Args:
        row: A namedtuple from ``DataFrame.itertuples()`` whose ``CTimeStamp`` and
            ``hackathonEndDate`` fields are pandas timestamps.

    Returns:
        int: whole weeks from the end week to the commit week; 0 when the commit is
        in the same week as, or any week before, the hackathon end.
    """
    commitWeekStart = row.CTimeStamp.normalize() - pd.Timedelta(days=row.CTimeStamp.weekday())
    endWeekStart = row.hackathonEndDate.normalize() - pd.Timedelta(days=row.hackathonEndDate.weekday())
    return max((commitWeekStart - endWeekStart).days // 7, 0)


def iterrows_impl(df):
    """Apply ``WeekCalc`` to every row of ``df``.

    The result shares ``df.index`` so that assigning it to a column of ``df`` lines
    up row by row even when the index is not 0..n-1.
    """
    return pd.Series([WeekCalc(row) for row in df.itertuples()], index=df.index)
  

# Weeks elapsed between the hackathon-end week and each commit's week.
Comb7['WeekNumber'] = iterrows_impl(Comb7)


In [None]:
# Delete unwanted columns
# ('level_0' and 'index' are the old indexes kept by the two earlier reset_index() calls)

unwantedCols = ['ProjectID', 'MainProjectL', '2YearsFlag', 'level_0', 'index']
Comb7 = Comb7.drop(columns=unwantedCols)

In [None]:
# Grouping to correct level with concat
# sorted() gives each flag combination one canonical spelling; plain set iteration
# order is arbitrary, so "1,3" and "3,1" could otherwise both appear.
dd = (
    Comb7[['BlobHash', 'WeekNumber', 'UsageFlag']]
    .groupby(['BlobHash', 'WeekNumber'])
    .agg({'UsageFlag': lambda x: ','.join(sorted(set(x)))})
    .reset_index()
)

In [None]:
# Calculate the maximum usage

def checkUsageMax(row):
    """Return the largest flag in a row's comma-separated ``UsageFlag`` value.

    The flags are compared as strings, which matches numeric order for the
    single-character flags used in this notebook.
    """
    return max(str(row.UsageFlag).split(","))


def iterrows_impl(df):
    """Apply ``checkUsageMax`` to every row of ``df``.

    The result shares ``df.index`` so that assigning it to a column of ``df`` lines
    up row by row even when the index is not 0..n-1.
    """
    return pd.Series([checkUsageMax(row) for row in df.itertuples()], index=df.index)
  

# Highest usage category of each blob within each week.
dd['UsageFlagMax'] = iterrows_impl(dd)


In [None]:
# Count of distinct blobs per (week, maximum usage category); after count() the
# BlobHash column holds that count.
WeekData = dd[['WeekNumber','UsageFlagMax','BlobHash']].drop_duplicates().groupby(['WeekNumber','UsageFlagMax']).count().reset_index()
WeekData

In [None]:
# Build pivot table
# One row per week, one column per maximum-usage category, holding the blob count.

USAGE_FLAG_COLUMNS = ['1', '3', '4', '5']
pivotWeekData = (
    WeekData
    .pivot_table(index='WeekNumber', columns='UsageFlagMax', values='BlobHash',
                 aggfunc='sum', fill_value=0)
    # Guarantee all four category columns exist, in this order. The original relabelled
    # the pivot output by position, which raises a length mismatch as soon as one
    # category is absent from the data. Any other category value is dropped here.
    .reindex(columns=USAGE_FLAG_COLUMNS, fill_value=0)
    .reset_index()
)
pivotWeekData.columns = ['WeekNumber'] + USAGE_FLAG_COLUMNS


In [None]:
# Blobs used in any category during the week (row-wise sum of the four category columns).
usageCols = ['1', '3', '4', '5']
pivotWeekData['TotalUsed'] = pivotWeekData[usageCols].sum(axis=1, skipna=False)
pivotWeekData

In [None]:
TOTAL_UNIQUE_BLOBS = 581579  ## 581579: Total unique blobs
pivotWeekData['TotalUnused'] = TOTAL_UNIQUE_BLOBS - pivotWeekData['TotalUsed']
pivotWeekData

In [None]:
# Persist the weekly usage pivot, without the index column.
pivotWeekData.to_csv('pivotWeekData.csv', sep=';', index=False)

### Calculate RQ3 a and b

In [None]:
# Add flag for any usage instance
# ContinuationFlag is 1.0 when the blob was committed outside its own hackathon
# project (UsageFlag other than '1') and 0.0 otherwise.

ProjectsUsedinOtherProjectsDF = (
    Comb5[['ProjectID_x', 'BlobHash', 'UsageFlag']]
    .drop_duplicates()
    .reset_index(drop=True)
)

usedElsewhere = ProjectsUsedinOtherProjectsDF['UsageFlag'] != '1'
ProjectsUsedinOtherProjectsDF['ContinuationFlag'] = usedElsewhere.astype(float)

ProjectsUsedinOtherProjectsDF

In [None]:
# Grouping and calculating the percentage of usage per project
# NOTE(review): 'count' counts rows, and the input was de-duplicated on
# (ProjectID_x, BlobHash, UsageFlag), so a blob with several distinct UsageFlag values
# is counted once per value -- confirm this is the intended denominator.

PratioDF = ProjectsUsedinOtherProjectsDF.groupby(['ProjectID_x']).agg({'ContinuationFlag':'sum', 'BlobHash': 'count'}).reset_index()
PratioDF.columns = ['ProjectID_x','ContinuationBlobCount','TotalBlobCount']
# Ratio is a fraction between 0 and 1, not a percentage.
PratioDF['Ratio'] = (PratioDF['ContinuationBlobCount'] / PratioDF['TotalBlobCount'])
PratioDF

In [None]:

# Plain alias: PrInfoAll and PrInfo2 refer to the same DataFrame object (no copy is made).
PrInfoAll = PrInfo2
PrInfoAll

In [None]:
# Add project Info from WoC
# Left join: every project in PratioDF is kept even when PrInfoAll has no row for it.
PratioVariablesDF = pd.merge( PratioDF, PrInfoAll, how='left', left_on=['ProjectID_x'], right_on=['projectID'])
PratioVariablesDF

In [None]:
# Keep only the DEVPOST columns used below and drop duplicate rows.
PrInfoDEVPOST = PrInfoDEVPOST[['id','hackathon-id','number-of-technologies','technologies','number-of-participants','likes','comments','hackathon-prize-money','hackathon-number-of-prizes','hackathon-is-colocated','winner']].drop_duplicates()
PrInfoDEVPOST

In [None]:
# Drop the two unnamed columns of woc-urls.csv. errors='ignore' keeps the cell
# re-runnable; the original `del` raised KeyError on a second run.
wocUrl = wocUrl.drop(columns=['Unnamed: 4', 'Unnamed: 5'], errors='ignore')
wocUrl

In [None]:
# Add project Info from DEVPOST
# Match DEVPOST projects to their WoC URL; the URL then serves as 'projectID'.

devpostCols = ['woc_url', 'number-of-technologies', 'technologies', 'number-of-participants',
               'likes', 'comments', 'hackathon-prize-money', 'hackathon-number-of-prizes',
               'hackathon-is-colocated', 'winner']
PrInfoAllDEVPOST = (
    PrInfoDEVPOST
    .merge(wocUrl, how='inner', left_on=['id', 'hackathon-id'],
           right_on=['devpost_id', 'hackathon_id'])[devpostCols]
    .drop_duplicates()
    .rename(columns={'woc_url': 'projectID'})
)
PrInfoAllDEVPOST

In [None]:
# Combine the DEVPOST attributes with the per-project ratios and WoC info.
# Both inputs have a 'projectID' column, so the merge suffixes them: 'projectID_x'
# comes from PrInfoAllDEVPOST and 'projectID_y' from PratioVariablesDF.
finalProjectInfo = pd.merge(PrInfoAllDEVPOST, PratioVariablesDF, how='inner', left_on=['projectID'], right_on=['ProjectID_x']).drop_duplicates()
# Select and order the columns kept for the output.
finalProjectInfo = finalProjectInfo[['projectID_x', 'number-of-technologies', 'technologies',
       'number-of-participants', 'likes', 'comments', 'hackathon-prize-money',
       'hackathon-number-of-prizes', 'hackathon-is-colocated', 'winner',
       'ProjectID_x', 'projectID_y',
       'numStars', 'NumAuthors', 'NumBlobs', 'rootFork', 'communitySize',
       'NumFiles', 'NumCommits', 'EarlistCommitDate', 'LatestCommitDate',
       'FileInfo','TotalBlobCount','ContinuationBlobCount','Ratio']]
finalProjectInfo

In [None]:
# Rename the two count columns and drop the redundant project-id columns.
# Renaming by name (not by position) cannot mislabel a column if the selection order
# changes, and together with errors='ignore' lets the cell be re-run safely; the
# original positional relabel of 25 names fails once two columns have been deleted.
finalProjectInfo = (
    finalProjectInfo
    .rename(columns={'TotalBlobCount': 'TotalBlobs',
                     'ContinuationBlobCount': 'sumContinuedBlobs'})
    .drop(columns=['ProjectID_x', 'projectID_y'], errors='ignore')
)
finalProjectInfo

In [None]:
# RQ3a output: per-project ratio of blobs used outside the hackathon project.
finalProjectInfo.to_csv('RQ3a.csv', sep=';',index=False)

In [None]:
# Add flag for any usage instance in large open source project
# ContinuationFlag is 1.0 when UsageFlag is '5' (large project) and 0.0 otherwise.

ProjectsUsedinOtherProjectsDF = (
    Comb5[['ProjectID_x', 'BlobHash', 'UsageFlag']]
    .drop_duplicates()
    .reset_index(drop=True)
)

usedInLarge = ProjectsUsedinOtherProjectsDF['UsageFlag'] == '5'
ProjectsUsedinOtherProjectsDF['ContinuationFlag'] = usedInLarge.astype(float)

ProjectsUsedinOtherProjectsDF

In [None]:
# Grouping and calculating the percentage of usage per project
# NOTE(review): 'count' counts rows, and the input was de-duplicated on
# (ProjectID_x, BlobHash, UsageFlag), so a blob with several distinct UsageFlag values
# is counted once per value -- confirm this is the intended denominator.

PratioDF = ProjectsUsedinOtherProjectsDF.groupby(['ProjectID_x']).agg({'ContinuationFlag':'sum', 'BlobHash': 'count'}).reset_index()
PratioDF.columns = ['ProjectID_x','ContinuationBlobCount','TotalBlobCount']
# Ratio is a fraction between 0 and 1, not a percentage.
PratioDF['Ratio'] = (PratioDF['ContinuationBlobCount'] / PratioDF['TotalBlobCount'])
PratioDF

In [None]:
# Add project Info from WoC
# Left join: every project in PratioDF is kept even when PrInfoAll has no row for it.

PratioVariablesDF = pd.merge( PratioDF, PrInfoAll, how='left', left_on=['ProjectID_x'], right_on=['projectID'])
PratioVariablesDF

In [None]:

# Same column selection as in the RQ3a section; repeating it on the already reduced
# frame selects the same columns again.
PrInfoDEVPOST = PrInfoDEVPOST[['id','hackathon-id','number-of-technologies','technologies','number-of-participants','likes','comments','hackathon-prize-money','hackathon-number-of-prizes','hackathon-is-colocated','winner']].drop_duplicates()
PrInfoDEVPOST

In [None]:
# Add project Info from DEVPOST
# Match DEVPOST projects to their WoC URL; the URL then serves as 'projectID'.

devpostCols = ['woc_url', 'number-of-technologies', 'technologies', 'number-of-participants',
               'likes', 'comments', 'hackathon-prize-money', 'hackathon-number-of-prizes',
               'hackathon-is-colocated', 'winner']
PrInfoAllDEVPOST = (
    PrInfoDEVPOST
    .merge(wocUrl, how='inner', left_on=['id', 'hackathon-id'],
           right_on=['devpost_id', 'hackathon_id'])[devpostCols]
    .drop_duplicates()
    .rename(columns={'woc_url': 'projectID'})
)
PrInfoAllDEVPOST

In [None]:
# Combine the DEVPOST attributes with the per-project ratios and WoC info.
# Both inputs have a 'projectID' column, so the merge suffixes them: 'projectID_x'
# comes from PrInfoAllDEVPOST and 'projectID_y' from PratioVariablesDF.
finalProjectInfo = pd.merge(PrInfoAllDEVPOST, PratioVariablesDF, how='inner', left_on=['projectID'], right_on=['ProjectID_x']).drop_duplicates()
# Select and order the columns kept for the output.
finalProjectInfo = finalProjectInfo[['projectID_x', 'number-of-technologies', 'technologies',
       'number-of-participants', 'likes', 'comments', 'hackathon-prize-money',
       'hackathon-number-of-prizes', 'hackathon-is-colocated', 'winner',
       'ProjectID_x', 'projectID_y',
       'numStars', 'NumAuthors', 'NumBlobs', 'rootFork', 'communitySize',
       'NumFiles', 'NumCommits', 'EarlistCommitDate', 'LatestCommitDate',
       'FileInfo','TotalBlobCount','ContinuationBlobCount','Ratio']]
finalProjectInfo

In [None]:
# Rename the two count columns and drop the redundant project-id columns.
# Renaming by name (not by position) cannot mislabel a column if the selection order
# changes, and together with errors='ignore' lets the cell be re-run safely; the
# original positional relabel of 25 names fails once two columns have been deleted.
finalProjectInfo = (
    finalProjectInfo
    .rename(columns={'TotalBlobCount': 'TotalBlobs',
                     'ContinuationBlobCount': 'sumContinuedBlobs'})
    .drop(columns=['ProjectID_x', 'projectID_y'], errors='ignore')
)
finalProjectInfo

In [None]:
# RQ3b output: per-project ratio of blobs used in large projects (UsageFlag '5').
finalProjectInfo.to_csv('RQ3b.csv', sep=';',index=False)

### Calculate what are the fractions of different blob types for each project

In [None]:
# Blob file-type table (comma-separated); joined below on its 'Blob' column.
Hack_b2type = pd.read_csv('Hack_b2type.csv', sep=',')
Hack_b2type

In [None]:
# Add datasets for blob to hackathon project info
# These files are read from the sibling CodeGeneration directory, not from SampleData.

hbacDF = pd.read_csv('../CodeGeneration/hbac-20201108.csv',sep=';', encoding = "ISO-8859-1")
hcbDF = pd.read_csv('../CodeGeneration/hcb-20201029a.csv',sep=';')
hpcDF = pd.read_csv('../CodeGeneration/hpc-20201028.csv',sep=';')


In [None]:
# Joining
# Step 1: join hbacDF and hcbDF on BlobHash to attach a CommitHash to each blob.
BC = pd.merge(hbacDF, hcbDF, how='inner', left_on=['BlobHash'], right_on=['BlobHash'])[['BlobHash','CommitHash','FirstTimestamp','FirstAuthorID','FirstCommitHash']]
# Step 2: join with hpcDF on CommitHash to attach the ProjectID, then drop duplicate rows.
BCP = pd.merge(BC, hpcDF, how='inner', left_on=['CommitHash'], right_on=['CommitHash'])[['BlobHash','ProjectID','FirstTimestamp','FirstAuthorID','FirstCommitHash']].drop_duplicates()
BCP

In [None]:
# Join with blob file type
# Inner join: blobs that have no row in Hack_b2type are dropped.
CodeInfoDF = pd.merge(BCP, Hack_b2type, how='inner', left_on=['BlobHash'], right_on=['Blob']).drop_duplicates()

In [None]:
# Count of blobs per project

BlobCountProject = (
    CodeInfoDF
    .groupby(['ProjectID'])['BlobHash']
    .count()
    .rename('TotalBlobsPerProject')
    .reset_index()
)


In [None]:
# Get the count of blobs per (project, type)

CodeInfoGroup = CodeInfoDF[['ProjectID','Type','BlobHash']].groupby(['ProjectID','Type']).agg({'BlobHash': 'count'}).reset_index()
# After the count, the BlobHash column holds the number of blobs, so it is renamed.
CodeInfoGroup.columns = ['ProjectID','Type','TotalBlobsPerType']

In [None]:
# Join dataframes
# Attach each project's total so the per-type share can be computed.

CodeInfoGroup2 = pd.merge(CodeInfoGroup,BlobCountProject, how='inner', left_on=['ProjectID'], right_on=['ProjectID'])
# Fraction (0 to 1) of the project's blobs that are of this type.
CodeInfoGroup2['BlobTypeRatio'] = CodeInfoGroup2['TotalBlobsPerType'] / CodeInfoGroup2['TotalBlobsPerProject']
CodeInfoGroup2

In [None]:
# Build Pivot
# One row per project, one column per blob Type, holding that type's share of the
# project's blobs (0 where the project has no blob of that type).

pivotCodeInfoGroup = pd.pivot_table(CodeInfoGroup2,index=['ProjectID'],values=['BlobTypeRatio'],columns=['Type'],aggfunc=[np.sum],fill_value=0).reset_index()
# NOTE(review): these labels are assigned by position. pivot_table orders the Type
# columns by sorted value, so confirm that order really is Other, Data, Markup, Code,
# Prose, and that exactly five types occur (otherwise this raises a length mismatch).
pivotCodeInfoGroup.columns = ['ProjectID','pctOther','pctData','pctMarkup','pctCode','pctProse']
pivotCodeInfoGroup.to_csv('pivotCodeInfoGroup.csv', sep=';', index=False)
pivotCodeInfoGroup

### Regenerate the graphs as per new requirements

In [None]:
# Remove instances where code was not reused
# (UsageFlag '1' marks a commit inside the hackathon project itself)

Comb8 = Comb5[Comb5['UsageFlag'] != '1'].copy()
# Renumber the remaining categories: 3 -> 1 (small), 4 -> 2 (medium), 5 -> 3 (large).
# The result is assigned back instead of using `Comb8["UsageFlag"].replace(..., inplace=True)`:
# that chained in-place call acts on a temporary Series and leaves Comb8 unchanged
# under pandas Copy-on-Write.
Comb8['UsageFlag'] = Comb8['UsageFlag'].replace({"3": "1", "4": "2", "5": "3"})

In [None]:
# Concat and group on correct level

groupCols = ['devpost_id', 'ProjectID_x', 'hackathonStartDate', 'hackathonEndDate',
             'BlobHash', 'TimingFlag', 'AuthorFlag']
# sorted() gives each flag combination one canonical spelling; plain set iteration
# order is arbitrary, so "1,3" and "3,1" could otherwise be counted as two categories.
result = (
    Comb8[groupCols + ['UsageFlag']]
    .groupby(groupCols)
    .agg({'UsageFlag': lambda x: ','.join(sorted(set(x)))})
    .reset_index()
)

In [None]:

# Number of blobs per UsageFlag combination (reused blobs only), plus each share of the total.
rr = result[['BlobHash','UsageFlag']].groupby(['UsageFlag']).agg(['count'])
# apply() works column by column here, so each count is divided by its column total.
rr['Percentage'] = rr.apply(lambda x: 100 * x / float(x.sum()))
rr


In [None]:
from matplotlib.pyplot import figure
from matplotlib import pyplot as plt 
from matplotlib.gridspec import GridSpec
from pylab import figure, text, scatter, show

figure(num=None, figsize=(8, 6), dpi=80, facecolor='w', edgecolor='k')

# Sort once by percentage; labels and bar lengths are both read from the sorted frame.
byPercentage = rr.sort_values(by=['Percentage'])
x = byPercentage.reset_index()['UsageFlag'].values.flatten()
data = byPercentage[['Percentage']].values.flatten()

x_pos = list(range(len(x)))

plt.barh(x_pos, data, color='black')
plt.title("Percentage of usage categories")
plt.xlabel("Percentage")
plt.ylabel("Blob Usage Category")

# Hand-placed key for the renumbered categories (positions are in data coordinates).
for yPos, caption in ((0.4, '1: Small'), (0, '2: Medium'), (-0.4, '3: Large')):
    text(51, yPos, caption, fontsize=12)

plt.yticks(x_pos, x)

# Print the value just right of each bar's end.
for barIdx, pct in enumerate(data):
    label = str(round(pct, 2)) + '%'
    plt.text(pct + 0.5, barIdx - 0.1, label, color='Black')

plt.show()

In [None]:
# logic to get UsageFlagMax
# For maximum usage

def checkUsageMax(row):
    """Return the largest flag in a row's comma-separated ``UsageFlag`` value.

    The flags are compared as strings, which matches numeric order for the
    single-character flags used in this notebook.
    """
    return max(str(row.UsageFlag).split(","))


def iterrows_impl(df):
    """Apply ``checkUsageMax`` to every row of ``df``.

    The result shares ``df.index`` so that assigning it to a column of ``df`` lines
    up row by row even when the index is not 0..n-1.
    """
    return pd.Series([checkUsageMax(row) for row in df.itertuples()], index=df.index)
  

# Highest (renumbered) usage category reached by each reused blob.
result['UsageFlagMax'] = iterrows_impl(result)


In [None]:

# Blob count per maximum-usage category and its share of the total.
result4 = result[['BlobHash','UsageFlagMax']].groupby(['UsageFlagMax']).agg(['count'])
result4['Percentage'] = result4.apply(lambda x: 100 * x / float(x.sum()))
# Turn the group key into a regular column and give the columns simple names.
result4 = result4.reset_index()
result4.columns = ['UsageFlagMax','BlobCount', 'BlobPercentage']
# result4["UsageFlagMax"].replace({"1": "[1] Small", "2": "[2] Medium","3": "[3] Large"}, inplace=True)
# Display only: the reset_index() result is not assigned, so result4 itself is unchanged.
result4.reset_index()


In [None]:
from matplotlib.pyplot import figure
from pylab import figure, text, scatter, show
import matplotlib.patches as mpatches
figure(num=None, figsize=(8, 6), dpi=80, facecolor='w', edgecolor='k')

# Sort once by percentage; labels and bar lengths are both read from the sorted frame.
byPercentage = result4.sort_values(by=['BlobPercentage'])
x = byPercentage.reset_index()['UsageFlagMax'].values.flatten()
data = byPercentage[['BlobPercentage']].values.flatten()

x_pos = list(range(len(x)))

plt.barh(x_pos, data, color='black')
plt.title("Percentage of usage categories Max")
plt.xlabel("Percentage")
plt.ylabel("Blob Usage Category Max")

# Hand-placed key for the renumbered categories (positions are in data coordinates).
for yPos, caption in ((-0.2, '1: Small'), (-0.3, '2: Medium'), (-0.4, '3: Large')):
    text(51, yPos, caption, fontsize=12)

plt.yticks(x_pos, x)

# Print the value just right of each bar's end.
for barIdx, pct in enumerate(data):
    label = str(round(pct, 2)) + '%'
    plt.text(pct + 1, barIdx - 0.1, label, color='Black')

plt.show()