In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.font_manager as fm
import matplotlib as mpl
import os
import pickle
import plotly
import matplotlib.gridspec as gridspec
from tqdm import tqdm_notebook

# Register malgun.ttf (Malgun Gothic) as the default figure font so Korean
# text renders. The file only exists on Windows installs, so guard the lookup:
# without it the notebook would fail here before any analysis runs.
font_path = "c:/Windows/Fonts/malgun.ttf"
if os.path.exists(font_path):
    font_name = fm.FontProperties(fname = font_path).get_name()
    plt.rc('font', family = font_name)
else:
    # Keep matplotlib's default font family when the Windows font is missing.
    font_name = None
# Draw negative tick labels with an ASCII hyphen instead of the Unicode minus.
mpl.rcParams['axes.unicode_minus'] = False


In [3]:
# Current working directory (not used further inside this cell).
os_cwd = os.getcwd()

# Locations of the 2017 and 2018 public CSV files.
Public_2017_directory = 'C:/team_project_data/2017_public.csv'
Public_2018_directory = 'C:/team_project_data/2018_public.csv'

# Read both files, 2017 first, into DataFrames.
Public_2017_data, Public_2018_data = (
    pd.read_csv(csv_path)
    for csv_path in (Public_2017_directory, Public_2018_directory)
)

# Report the 2017 frame's dimensions, then preview its first rows.
print(Public_2017_data.shape)
Public_2017_data.head()


(51392, 154)


Unnamed: 0,Respondent,Professional,ProgramHobby,Country,University,EmploymentStatus,FormalEducation,MajorUndergrad,HomeRemote,CompanySize,...,StackOverflowMakeMoney,Gender,HighestEducationParents,Race,SurveyLong,QuestionsInteresting,QuestionsConfusing,InterestedAnswers,Salary,ExpectedSalary
0,1,Student,"Yes, both",United States,No,"Not employed, and not looking for work",Secondary school,,,,...,Strongly disagree,Male,High school,White or of European descent,Strongly disagree,Strongly agree,Disagree,Strongly agree,,
1,2,Student,"Yes, both",United Kingdom,"Yes, full-time",Employed part-time,Some college/university study without earning ...,Computer science or software engineering,"More than half, but not all, the time",20 to 99 employees,...,Strongly disagree,Male,A master's degree,White or of European descent,Somewhat agree,Somewhat agree,Disagree,Strongly agree,,37500.0
2,3,Professional developer,"Yes, both",United Kingdom,No,Employed full-time,Bachelor's degree,Computer science or software engineering,"Less than half the time, but at least one day ...","10,000 or more employees",...,Disagree,Male,A professional degree,White or of European descent,Somewhat agree,Agree,Disagree,Agree,113750.0,
3,4,Professional non-developer who sometimes write...,"Yes, both",United States,No,Employed full-time,Doctoral degree,A non-computer-focused engineering discipline,"Less than half the time, but at least one day ...","10,000 or more employees",...,Disagree,Male,A doctoral degree,White or of European descent,Agree,Agree,Somewhat agree,Strongly agree,,
4,5,Professional developer,"Yes, I program as a hobby",Switzerland,No,Employed full-time,Master's degree,Computer science or software engineering,Never,10 to 19 employees,...,,,,,,,,,,


In [None]:
# Set of column names singled out for the analysis (order is irrelevant: it is a set).
Important_feature = {
    # respondent background
    'Professional',
    'ProgramHobby',
    'Country',
    'EmploymentStatus',
    'FormalEducation',
    'MajorUndergrad',
    'CompanyType',
    'YearsProgram',
    'DeveloperType',
    'CareerSatisfaction',
    'HighestEducationParents',
    'Gender',
    # attitude statements
    'ProblemSolving',
    'BuildingThings',
    'LearningNewTech',
    'BoringDetails',
    'JobSecurity',
    'FriendsDevelopers',
    'RightWrongWay',
    'UnderstandComputers',
    'SeriousWork',
    'ChallengeMyself',
    'ChangeWorld',
    'EnjoyDebugging',
    'DifficultCommunication',
    'WorkPayCare',
    # "Important..." columns
    'ImportantBenefits',
    'ImportantHiringAlgorithms',
    'ImportantHiringTechExp',
    'ImportantHiringCommunication',
    'ImportantHiringOpenSource',
    'ImportantHiringPMExp',
    'ImportantHiringEducation',
    'ImportantHiringGettingThingsDone',
}


### Investigate the relationship between personality and developer type
# Columns related to ability : ['FormalEducation','MajorUndergrad','FriendsDevelopers']

# Columns related to personality : ['ProgramHobby','ProblemSolving','BuildingThings','LearningNewTech','BoringDetails','JobSecurity',
#                        'RightWrongWay','SeriousWork','ChallengeMyself','ChangeWorld','ImportantHiringEducation',
#                         'ImportantHiringGettingThingsDone' ,'DifficultCommunication']

# Columns related to languages : ['HaveWorkedLanguage','WantWorkLanguage','HaveWorkedFramework','WantWorkFramework',
#                       'HaveWorkedDatabase','WantWorkDatabase','HaveWorkedPlatform','WantWorkPlatform']

# Abilities considered important for a developer : ['UnderstandComputers','ImportantHiringAlgorithms','ImportantHiringTechExp','ImportantHiringOpenSource'
#                                        ,'ImportantHiringCommunication','ImportantHiringEducation','ImportantHiringGettingThingsDone']

# Columns related to current status : ['DeveloperType','CareerSatisfaction','EmploymentStatus','Professional','Country']
# Investigate the languages most used by web developers : ['DeveloperType']
# Investigate the languages most used by front-end / back-end developers

In [None]:
# Preview the first rows of the DeveloperType column.
Public_2017_data.loc[:, 'DeveloperType'].head()

In [None]:
def Hot_encoding(DataFrame, Column_name, sep='; ', dropna=True, threading=False, df_dict=None):
    """One-hot encode a column whose cells hold several items joined by ``sep``.

    Every cell of ``DataFrame[Column_name]`` is split on ``sep``; each distinct
    item becomes one 0/1 indicator column.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Frame that contains ``Column_name``.
    Column_name : label
        Column to encode.
    sep : str, default '; '
        Separator between items. It is used both to discover the items and to
        fill the indicators (previously discovery always split on '; ').
    dropna : bool, default True
        If True, rows with a missing value are left out of the result.
        If False, they are kept as all-zero rows (previously this path crashed).
    threading, df_dict :
        Unused; kept so existing call sites remain valid.

    Returns
    -------
    pandas.DataFrame
        int8 frame indexed like the (optionally NaN-dropped) column, with one
        column per distinct item in sorted order.
    """
    Column_series = DataFrame[Column_name]
    if dropna:
        Column_series = Column_series.dropna()

    # Split every cell once; anything that is not a string (NaN/None) has no items.
    split_items = [
        value.split(sep) if isinstance(value, str) else []
        for value in Column_series
    ]
    # Sorted so the column order is reproducible between runs (a set is unordered).
    item_names = sorted({item for items in split_items for item in items})

    print(Column_series.shape[0])
    print(len(item_names))

    # Fill a plain int8 array positionally instead of chained `df[item].loc[i] = 1`,
    # which is slow and may write to a temporary copy.
    column_position = {name: position for position, name in enumerate(item_names)}
    encoded = np.zeros((len(split_items), len(item_names)), dtype=np.int8)
    for row_position, items in enumerate(split_items):
        for item in items:
            encoded[row_position, column_position[item]] = 1

    return pd.DataFrame(encoded, index=Column_series.index, columns=item_names)


def Hot_encoding_multi_processing(DataFrame, Column_names, sep='; ', dropna=True, df_dict=None,time_out = 1):
    """Run ``Hot_encoding`` for several columns in a pool of three worker processes.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Frame passed to every ``Hot_encoding`` call.
    Column_names : iterable of labels
        Columns to encode.
    sep, dropna :
        Forwarded to ``Hot_encoding`` (previously accepted but ignored).
    df_dict : dict or None
        Mapping that receives ``{Column_name: encoded frame}``. A new dict is
        created when None is given.
    time_out : number, default 1
        Seconds to wait for each column's result once collection starts.

    Returns
    -------
    dict
        ``df_dict`` with one entry per column.

    Raises
    ------
    multiprocessing.TimeoutError
        If a result is not ready within ``time_out`` seconds.

    Notes
    -----
    NOTE(review): worker processes must be able to import ``Hot_encoding``.
    When it is defined interactively in a notebook on a platform that starts
    workers with "spawn", the tasks may never complete — confirm on the
    target platform.
    """
    from multiprocessing import Pool

    if df_dict is None:
        df_dict = dict()

    def printResult(result):
        print(result)

    # The context manager shuts the pool down even when a result times out.
    with Pool(processes=3) as pool:
        # Submit every column before waiting on any of them; calling .get()
        # right after each submit would make the columns run one at a time.
        pending = [
            (Column_name,
             pool.apply_async(Hot_encoding, args=(DataFrame, Column_name, sep, dropna),
                              callback = printResult))
            for Column_name in Column_names
        ]
        for Column_name, result in pending:
            df_dict[Column_name] = result.get(timeout=time_out)
            print(Column_name)
    return df_dict

# Columns encoded in this run; further candidates are kept in the comment below.
#,'LearningNewTech','BoringDetails','JobSecurity','RightWrongWay','SeriousWork','ChallengeMyself','ChangeWorld'
#df = Hot_encoding(Public_2017_data,'HaveWorkedLanguage')
Column_names = ['ProgramHobby','ProblemSolving','BuildingThings']

# Start from an empty result dict and let the helper fill and return it.
df_dict = Hot_encoding_multi_processing(
    Public_2017_data,
    Column_names,
    df_dict = dict(),
    time_out = 1000,
)

In [None]:
## How to save an object
# NOTE(review): in the visible notebook Developer_dict is only assigned in a
# later cell, so this cell depends on that cell having been run — confirm order.
directory = os.getcwd() + '/data/data_object/Developer_language_dict.pkl'
with open(directory, 'wb') as f :
    pickle.dump(Developer_dict, f)

In [None]:
## How to load an object
directory = ''.join([os.getcwd(), '/data/data_object/Developer_language_dict.pkl'])
# Read the pickled dictionary back from the path built above.
with open(directory, mode='rb') as f:
    developement_language_dict = pickle.load(f)

In [None]:
# Load the DeveloperType table, using the CSV's first column as the index.
Developetype_df = pd.read_csv(os.getcwd() +'/data/Developer_df/DeveloperType.csv',index_col = 0)

# One entry per column: its label and the row count of Drop_zeros(frame, label).
# Drop_zeros is defined outside this view; presumably it keeps the rows where
# that column is non-zero — confirm against its definition.
Developer_kind_list = list(Developetype_df.columns)
Developer_number_list = [
    Drop_zeros(Developetype_df, kind).shape[0]
    for kind in Developer_kind_list
]

# Counts keyed by column label, sorted ascending.
Developer_number_sr = pd.Series(Developer_number_list,index = Developer_kind_list).sort_values()

## How to save an object
directory = os.getcwd() + '/data/data_object/Developer_number_sr.pkl'
with open(directory, 'wb') as pickle_file :
    pickle.dump(Developer_number_sr,pickle_file)


In [None]:
# Reload the pickled counts and draw them as a horizontal bar chart.
directory = os.getcwd() + '/data/data_object/Developer_number_sr.pkl'
with open(directory, mode='rb') as pickle_file:
    Developer_number_sr = pickle.load(pickle_file)

Developer_number_sr.plot.barh()

In [None]:
# Write Developer_number_sr to <cwd>/data/ (note: not the data_object/ folder
# used by the other pickle cells).
with open(os.getcwd()+'/data/Developer_number_sr.pkl', mode='wb') as pickle_file:
    pickle.dump(Developer_number_sr, pickle_file)

In [None]:
# I want to see the popular languages for the big 5.
# Language_column = ['HaveWorkedLanguage','WantWorkLanguage','HaveWorkedFramework','WantWorkFramework',
#                    'HaveWorkedDatabase','WantWorkDatabase','HaveWorkedPlatform','WantWorkPlatform']
# I think it would be good to use the 'Want' columns for future job searching.
# Language_column = ['WantWorkLanguage','WantWorkFramework','WantWorkDatabase','WantWorkPlatform']


In [None]:
# Load the DeveloperType table, using the CSV's first column as the index.
Developertype = pd.read_csv(
    os.getcwd() + '/data/Developer_df/DeveloperType.csv',
    index_col=0,
)

In [None]:
# Show the 'Other' column of the DeveloperType table.
Developertype.loc[:, 'Other']

In [None]:
## Disposition table per developer type
directory = os.getcwd() + '/data/data_object/Developer_number_sr.pkl'
with open(directory, 'rb') as f :
    Developer_number_sr = pickle.load(f)

Developertype =pd.read_csv(os.getcwd() +'/data/Developer_df/DeveloperType.csv',index_col = 0)

# Columns exported for every developer type. The selection does not depend on
# the loop variable, so it is made once up front.
trait_columns = ['ProgramHobby','ProblemSolving','BuildingThings','LearningNewTech','BoringDetails','JobSecurity'
                 ,'RightWrongWay','SeriousWork','ChallengeMyself','ChangeWorld','ImportantHiringEducation',
                 'ImportantHiringGettingThingsDone' ,'DifficultCommunication','FormalEducation','MajorUndergrad','FriendsDevelopers'
                 ,'Country']
trait_df = Public_2017_data[trait_columns]

# The last three entries of Developer_number_sr; one CSV is written per entry.
for i in Developer_number_sr[-3:].index:
    # Select the rows flagged for *this* developer type. The previous code
    # always filtered on 'Other', so every exported file held the same rows.
    developer_series = Developertype[i][Developertype[i]!=0]
    # Join on the row index to attach the selected columns.
    developer_df = pd.merge(developer_series,trait_df,left_index=True,right_index=True)
    developer_df.to_csv(os.getcwd()+'/data/Developer_df/'+i+'.csv')

In [None]:
# When only a single developer type should be loaded
Developertype = pd.read_csv(os.getcwd() + '/data/Developer_df/DeveloperType.csv', index_col=0)
directory = os.getcwd() + '/data/data_object/Developer_number_sr.pkl'

# Keep only the rows whose 'Other' value is non-zero.
other_flags = Developertype['Other']
developer_series = other_flags[other_flags != 0]

# Columns of the 2017 data joined onto those rows (matched on the row index).
selected_columns = [
    'ProgramHobby', 'ProblemSolving', 'BuildingThings', 'LearningNewTech',
    'BoringDetails', 'JobSecurity', 'RightWrongWay', 'SeriousWork',
    'ChallengeMyself', 'ChangeWorld', 'ImportantHiringEducation',
    'ImportantHiringGettingThingsDone', 'DifficultCommunication',
    'FormalEducation', 'MajorUndergrad', 'FriendsDevelopers', 'Country',
]
developer_df = pd.merge(
    developer_series,
    Public_2017_data[selected_columns],
    left_index=True,
    right_index=True,
)

# Save the joined table and preview its first rows.
developer_df.to_csv(os.getcwd()+'/data/Developer_df/'+'Other.csv')
developer_df.head()

In [None]:
# Select the columns of interest from the 2017 data ...
selected_columns = [
    'ProgramHobby', 'ProblemSolving', 'BuildingThings', 'LearningNewTech',
    'BoringDetails', 'JobSecurity', 'RightWrongWay', 'SeriousWork',
    'ChallengeMyself', 'ChangeWorld', 'ImportantHiringEducation',
    'ImportantHiringGettingThingsDone', 'DifficultCommunication',
    'FormalEducation', 'MajorUndergrad', 'FriendsDevelopers', 'Country',
]
developer_df = Public_2017_data[selected_columns]

# ... and join them onto developer_series (assigned in an earlier cell) by row index.
developer_df = pd.merge(
    developer_series,
    developer_df,
    left_index=True,
    right_index=True,
)

In [None]:
# Drop the entries of developer_series that are equal to zero.
developer_series = developer_series.loc[developer_series != 0]

In [None]:
# Display developer_series as the cell output.
developer_series

In [None]:
# Display developer_df as the cell output.
# NOTE(review): this renders the whole frame; `.head()` would keep the output short.
developer_df

In [None]:
# Reload the per-developer-type counts (sorted ascending when they were saved).
with open(os.getcwd() +'/data/data_object/Developer_number_sr.pkl', 'rb') as f :
    Developer_number_sr = pickle.load(f)

# Read with index_col=0 like every other read of this file in the notebook.
# Without it the frame gets a fresh 0..n-1 index, and the labels returned by
# Drop_zeros(...).index would no longer be the row labels of Public_2017_data.
Developertype_df = pd.read_csv(os.getcwd() +'/data/Developer_df/DeveloperType.csv',index_col = 0)

# The last ten entries of the sorted counts.
Top10_developement_type = Developer_number_sr[-10:]
Language_categories  = ['HaveWorkedLanguage','WantWorkLanguage','HaveWorkedFramework','WantWorkFramework','HaveWorkedDatabase','WantWorkDatabase','HaveWorkedPlatform','WantWorkPlatform']

# Developer_dict[developer type][category] -> the last ten entries of the sorted
# item counts for that category. Drop_zeros and count_each_item_in_Series are
# defined outside this view.
Developer_dict = dict()
for column in Top10_developement_type.index:
    # Row labels for this developer type; computed once instead of once per category.
    developer_index = Drop_zeros(Developertype_df,column).index
    # Built in Language_categories order so the key order matches that list
    # (the earlier placeholder dict listed the two Framework keys in swapped order).
    Developer_dict[column] = {
        category: count_each_item_in_Series(
            Public_2017_data[category].loc[developer_index].dropna()
        ).sort_values()[-10:]
        for category in Language_categories
    }


    

In [None]:
# Plot the 'Web developer' entry of the loaded dictionary.
# language_plot is defined outside this view; 'Barh' is passed as its `kind` argument.
language_plot(developement_language_dict,'Web developer',kind = 'Barh')

In [None]:
# Load {developer type: {category: item-count Series}} saved by an earlier cell.
with open(os.getcwd() +'/data/data_object/Developer_language_dict.pkl', 'rb') as f :
    developement_language_dict = pickle.load(f)

# Category order used for every rank list built below.
x=['HaveWorkedLanguage','WantWorkLanguage','HaveWorkedFramework','WantWorkFramework','HaveWorkedDatabase','WantWorkDatabase','HaveWorkedPlatform','WantWorkPlatform']
labels = ['1st','2nd','3rd','4th','5th']

# Language_rank_dict[developer type][rank label] ->
#     {'index': [item name per category], 'value': [count per category]}
Language_rank_dict = dict()
for developement_key, category_counts in developement_language_dict.items():
    # Take the last five entries of each category and order them descending,
    # once per category rather than twice per rank.
    # Categories are visited in the explicit order of `x`, so the lists do not
    # depend on the key order of the pickled dict.
    top5_by_category = {
        Key: category_counts[Key][-5:].sort_values(ascending = False)
        for Key in x
    }
    Language_rank_dict[developement_key] = dict()
    for rank_count, label in enumerate(labels):
        ranked_names = []
        ranked_values = []
        for Key in x:
            top5 = top5_by_category[Key]
            index = top5.index[rank_count]
            # Use the short name for this one long label.
            if index == 'Amazon Web Services (AWS)' :
                index = 'AWS'
            ranked_names.append(index)
            # .iloc: the Series is labelled by item name, so an integer passed
            # to [] would rely on deprecated positional fallback.
            ranked_values.append(top5.iloc[rank_count])
        Language_rank_dict[developement_key][label] = {"index": ranked_names,'value' : ranked_values}

with open(os.getcwd()+'/data/data_object/Language_rank_dict.pkl', 'wb') as f:
    pickle.dump(Language_rank_dict, f)

In [None]:
# Rank labels and category names set before plotting.
labels = ['1st','2nd','3rd','4th','5th']
x = ['HaveWorkedLanguage','WantWorkLanguage','HaveWorkedFramework','WantWorkFramework','HaveWorkedDatabase','WantWorkDatabase','HaveWorkedPlatform','WantWorkPlatform']

# Reload the pickled rank dictionary.
with open(os.getcwd() +'/data/data_object/Language_rank_dict.pkl', mode='rb') as pickle_file:
    Language_rank_dict = pickle.load(pickle_file)

# Chart the 'Systems administrator' entry; the plotting helper is defined outside this view.
plot_bar_chart_about_language_per_dev(Language_rank_dict,'Systems administrator',labels)

In [None]:
import pandas as pd

# NOTE(review): absolute, machine-specific path; the export cells write these
# CSVs under os.getcwd() + '/data/Developer_df/' — confirm both point to the same folder.
directory = 'C:/Users/acorn/YDY_FOLDER/Python Analysis/팀 프로젝트/data/Developer_df/Desktop applications developer.csv'
desk_df = pd.read_csv(filepath_or_buffer=directory)

In [None]:
# NOTE(review): absolute, machine-specific path; the export cells write these
# CSVs under os.getcwd() + '/data/Developer_df/' — confirm both point to the same folder.
directory = 'C:/Users/acorn/YDY_FOLDER/Python Analysis/팀 프로젝트/data/Developer_df/Web developer.csv'
web_df = pd.read_csv(filepath_or_buffer=directory)
# Show the loaded frame as the cell output.
web_df

In [None]:
# NOTE(review): absolute, machine-specific path; the export cells write these
# CSVs under os.getcwd() + '/data/Developer_df/' — confirm both point to the same folder.
directory = 'C:/Users/acorn/YDY_FOLDER/Python Analysis/팀 프로젝트/data/Developer_df/Mobile developer.csv'
mobile_df = pd.read_csv(filepath_or_buffer=directory)

In [None]:
import matplotlib.pyplot as plt
!matplotlib inline
columns = ['ProgramHobby','ProblemSolving','BuildingThings','LearningNewTech','BoringDetails','JobSecurity',
           'RightWrongWay','SeriousWork','ChallengeMyself','ChangeWorld']
desk_dict = dict()
for column in columns :
    desk_dict[column] = desk_df[column].value_counts()
    desk_dict[column].plot(kind = 'barh')

In [None]:
# List the keys of web_dict.
# NOTE(review): in the visible notebook web_dict is first assigned in the cell
# below this one, so a top-to-bottom run would fail here unless it is defined
# outside this view — confirm cell order.
web_dict.keys()

In [None]:
# web_dict[column] -> value counts of that column in web_df
# (`columns` is the list assigned in an earlier cell).
web_dict = dict()
for column in columns :
    web_dict[column] = web_df[column].value_counts()
    # Give every column its own figure; plotting without an explicit axes
    # draws all the bar charts on top of each other in one figure.
    fig, ax = plt.subplots()
    web_dict[column].plot(kind = 'barh', ax = ax)
    ax.set_title(column)

In [None]:
# mobile_dict[column] -> value counts of that column in mobile_df
# (`columns` is the list assigned in an earlier cell).
mobile_dict = dict()
for column in columns :
    mobile_dict[column] = mobile_df[column].value_counts()
    # Give every column its own figure; plotting without an explicit axes
    # draws all the bar charts on top of each other in one figure.
    fig, ax = plt.subplots()
    mobile_dict[column].plot(kind = 'barh', ax = ax)
    ax.set_title(column)

In [None]:
# Horizontal bar chart of the 'ProblemSolving' counts stored in web_dict.
web_dict['ProblemSolving'].plot.barh()

In [None]:
# Horizontal bar chart of the 'ProblemSolving' counts stored in desk_dict.
desk_dict['ProblemSolving'].plot.barh()

In [None]:
# Horizontal bar chart of the 'ProblemSolving' counts stored in mobile_dict.
mobile_dict['ProblemSolving'].plot.barh()