In [1]:
import warnings
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from IPython.core.display import HTML
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')
# Let pandas display up to 500 columns when a DataFrame is rendered
pd.set_option('display.max_columns', 500)

#Loading Datasets
# One responses CSV per survey year (2022, 2021, 2020), read from the Kaggle input directory
response_2022=pd.read_csv('/kaggle/input/kaggle-survey-2022/kaggle_survey_2022_responses.csv')
response_2021=pd.read_csv('/kaggle/input/kaggle-survey-2021/kaggle_survey_2021_responses.csv')
response_2020=pd.read_csv('/kaggle/input/kaggle-survey-2020/kaggle_survey_2020_responses.csv')

# CSS styling for markdown
# Defines the classes referenced by the HTML in the markdown cells:
# main-heading, sub-heading, default-font, graph-q-font, graph-heading, ref-heading, bold-text.
styling = """
    <style>
        .main-heading{
            background-color: #4da67a;
            color: white !important;
            
            font-size: 32px !important;
            padding: 12px 12px;
            margin-bottom: 5px;
            border-radius: 4px;
            box-shadow: rgba(0, 0, 0, 0.19) 0px 10px 10px, rgba(0, 0, 0, 0.23) 0px 6px 6px;
            border-radius: 25px;
            
            
        }

        .sub-heading{
            width: auto !important;
            background-color: #7aa64d;
            color: white !important;
            
            font-size: 24px !important;
            padding: 10px 12px;
            margin-bottom: 3px;
            box-shadow: rgba(0, 0, 0, 0.16) 0px 3px 6px, rgba(0, 0, 0, 0.23) 0px 3px 6px;
            border-radius: 20px;
        }
        
        .default-font{
            
            font-size: 15px ;
        }
        .graph-q-font{
            
            font-size: 12px !important;
        }
        .graph-heading{
            border-bottom: 5px solid #7aa64d;
            width: auto !important;
            color: #7aa64d !important;
            
            font-size: 24px !important;
            padding: 10px 12px;
            margin-bottom: 3px;
        }
        .ref-heading{
            text-decoration: underline;
            color: #7aa64d !important;
            
            font-size: 24px !important;
            padding: 10px 12px;
            margin-bottom: 3px;
        }
        .bold-text{
            width: auto !important;
            color: #7aa64d !important;
            
            font-size: 18px !important;
            padding: 10px 12px;
           
        }
    </style>
"""
# Last expression of the cell, so IPython renders the HTML object (and with it the <style> block)
HTML(styling)

In [2]:
## User Defined Functions

# Unpivot ("melt") helper: turns a wide, grouped DataFrame into long format by
# emitting one output row per (row, non-index column) pair of the input.
def get_vertical_df(df,idx_col,pcol_nm):
    """Reshape a wide DataFrame into long format.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide frame holding the identifier column ``idx_col`` plus one column
        per option.
    idx_col : str
        Name of the identifier column; its value is repeated on every output row.
    pcol_nm : str
        Name of the output column that receives the original column names.

    Returns
    -------
    pandas.DataFrame
        Columns ``[idx_col, pcol_nm, 'Percentage']`` in row-major order (all
        option columns of the first input row, then the second row, ...).
        An empty input yields an empty frame with those three columns.
    """
    value_cols = [col for col in df.columns if col != idx_col]

    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and re-allocated the whole frame on every call.
    records = [
        {idx_col: row[idx_col], pcol_nm: col, 'Percentage': row[col]}
        for _, row in df.iterrows()
        for col in value_cols
    ]

    # Explicit column list keeps the schema stable even when there are no records.
    return pd.DataFrame(records, columns=[idx_col, pcol_nm, 'Percentage'])

# Truncates a string at the first occurrence of a marker; the callers in this
# notebook pass '(' to drop the parenthesised suffix of a column name.
def remove_after_text(pstring,ch):
    """Return ``pstring`` cut just before the first occurrence of ``ch``.

    The marker itself is dropped as well. The string is returned unchanged when
    ``ch`` does not occur, or when it occurs at position 0.
    """
    cut_at = pstring.find(ch)
    return pstring[:cut_at] if cut_at > 0 else pstring


def get_summary_df(df,keyword,keyword_name,remove_after=False):
    """Share (in percent) of non-null answers per option column of one question.

    Parameters
    ----------
    df : pandas.DataFrame
        Survey frame whose option columns are named ``<keyword>_<option>``.
    keyword : str
        Column-name prefix that selects the option columns.
    keyword_name : str
        Name given to the option-label column of the result.
    remove_after : bool
        When True, each label is cut at its first '('.

    Returns
    -------
    pandas.DataFrame
        Columns ``[keyword_name, 'Percentage']``; each percentage is the
        column's non-null count divided by the total over all option columns,
        rounded to 2 decimals.
    """
    option_cols = df.filter(regex='^'+keyword+'_',axis=1).columns.to_list()

    # Non-null answers per option column, as a two-column frame ('index', 'Count')
    summary = df[option_cols].count().rename('Count').reset_index()
    grand_total = summary['Count'].sum()
    summary['Count'] = summary['Count'].div(grand_total).mul(100).round(2)

    # Strip the keyword prefix so that only the option label remains
    summary['index'] = summary['index'].str.replace(keyword+'_','')
    if (remove_after==True):
        summary['index'] = summary['index'].apply(lambda label: remove_after_text(label,'('))

    return summary.rename(columns={'index':keyword_name,'Count':'Percentage'})


def get_industry_wise_df (df,keyword,keyword_name,remove_after=False):
    """Per-industry share (in percent) of each option of one multiple-choice question.

    Parameters
    ----------
    df : pandas.DataFrame
        Survey frame with an 'Industry' column and option columns named
        ``<keyword>_<option>``.
    keyword : str
        Column-name prefix that selects the option columns.
    keyword_name : str
        Name given to the option-label column of the result.
    remove_after : bool
        When true, each label is cut at its first '('.

    Returns
    -------
    pandas.DataFrame
        Long-format frame produced by ``get_vertical_df``: one row per
        (industry, option) with the option's share of that industry's answers
        in 'Percentage'.
    """
    course_columns=df.filter(regex='^'+keyword+'_',axis=1).columns.to_list()

    # Non-null answers per industry and option
    counts=df[['Industry']+course_columns].groupby(['Industry']).count().reset_index()

    # Sum the option columns only: after reset_index() the frame also holds the
    # string column 'Industry', and a row-wise sum over mixed dtypes raises on pandas 2.x.
    row_totals=counts[course_columns].sum(axis=1)
    counts[course_columns]=counts[course_columns].div(row_totals, axis=0).mul(100).round(2)

    vert_df=get_vertical_df(counts,'Industry',keyword_name)

    # Literal replacement, so the result does not depend on the pandas regex default
    vert_df[keyword_name]=vert_df[keyword_name].str.replace(keyword+'_','',regex=False)
    if remove_after:
        vert_df[keyword_name]=vert_df[keyword_name].apply(lambda label: remove_after_text(label,'('))

    return vert_df

def get_base_keword_wise_df (df,base,keyword,keyword_name,remove_after=False):
    """Per-group share (in percent) of each option of one multiple-choice question.

    Parameters
    ----------
    df : pandas.DataFrame
        Survey frame holding the grouping column ``base`` and option columns
        named ``<keyword>_<option>``.
    base : str
        Column to group by, e.g. 'CV_None', which flags whether a respondent
        uses computer vision.
    keyword : str
        Column-name prefix that selects the option columns, e.g. 'CCP' for the
        cloud computing platform question.
    keyword_name : str
        Name given to the option-label column of the result.
    remove_after : bool
        When true, each label is cut at its first '('.

    Returns
    -------
    pandas.DataFrame
        Long-format frame produced by ``get_vertical_df``: one row per
        (base value, option) with the option's share of that group's answers
        in 'Percentage'.
    """
    # The grouping column must not double as an option column
    # (e.g. base='CV_None' together with keyword='CV').
    course_columns=[
        col for col in df.filter(regex='^'+keyword+'_',axis=1).columns.to_list()
        if col != base
    ]

    # Non-null answers per group and option
    counts=df[[base]+course_columns].groupby([base]).count().reset_index()

    # Sum the option columns only: after reset_index() the frame also holds the
    # `base` column, which may contain strings, and a row-wise sum over mixed
    # dtypes raises on pandas 2.x.
    row_totals=counts[course_columns].sum(axis=1)
    counts[course_columns]=counts[course_columns].div(row_totals, axis=0).mul(100).round(2)

    vert_df=get_vertical_df(counts,base,keyword_name)

    # Literal replacement, so the result does not depend on the pandas regex default
    vert_df[keyword_name]=vert_df[keyword_name].str.replace(keyword+'_','',regex=False)
    if remove_after:
        vert_df[keyword_name]=vert_df[keyword_name].apply(lambda label: remove_after_text(label,'('))

    return vert_df

# Shortens long industry names so that they fit on the graph axes
def industry_name_mapping(x):
    """Return the short label for industry name ``x``; any other value is returned unchanged."""
    short_names = {
        'Shipping/Transportation': 'Ship/Trans',
        'Computers/Technology': 'Com/Tech',
        'Academics/Education': 'Academics/Edu',
        'Accounting/Finance': 'Accounting',
        'Government/Public Service': 'Government',
        'Medical/Pharmaceutical': 'Medical/Pharma',
        'Manufacturing/Fabrication': 'Mfs/Fabrication',
        'Insurance/Risk Assessment': 'Ins/Risk Assessment',
        'Online Service/Internet-based Services': 'Online Service',
        'Broadcasting/Communications': 'Broadcasting',
    }
    return short_names.get(x, x)

# Shortens computer vision method names so that they fit on the graph axes
def cv_name_mapping(x):
    """Return the short label for computer vision method name ``x``; any other value is returned unchanged.

    The keys end with a space; the plotting functions in this notebook pass
    labels that were cut at '(' by ``remove_after_text``.
    """
    # NOTE(review): 'claasify' looks like a typo for 'classify'; the label is kept as-is
    # because it is displayed text that other cells may match on.
    short_names = {
        'General purpose image/video tools ': 'General img/video tools ',
        'Image segmentation methods ': 'Img segmentation',
        'Object detection methods ': 'Obj detection',
        'Image classification and other general purpose networks ': 'Img claasify/General Nets ',
        'Generative Networks ': 'GAN',
        'Vision transformer networks ': 'ViT',
    }
    return short_names.get(x, x)

# Shortens long country names so that they fit on the graph axes
def country_name_mapping(x):
    """Return the short label for country name ``x``; any other value is returned unchanged."""
    short_names = {
        'United Arab Emirates': 'UAE',
        'I do not wish to disclose my location': 'Unknown',
        'Iran, Islamic Republic of...': 'Iran',
        'United Kingdom of Great Britain and Northern Ireland': 'UK',
        'United States of America': 'USA',
    }
    return short_names.get(x, x)


In [3]:
# Only the columns needed for this analysis are kept.
# The Q19 (computer vision) columns must stay first: a later step of this cell
# addresses them by position.
_single_choice_cols=['Q2','Q3','Q4','Q8','Q23','Q24','Q25','Q11','Q16','Q29']
# NOTE(review): '^Q43_' only matches columns named 'Q43_...'; the rename below maps a bare
# 'Q43' to 'TPU', which is never selected here - confirm whether 'Q43' should be selected.
_multi_choice_patterns=[
    '^Q33_','^Q28_','^Q6_','^Q7_','^Q44_','^Q12_','^Q13_','^Q14_','^Q15_','^Q35_','^Q36_',
    '^Q39_','^Q34_','^Q40_','^Q41_','^Q42_','^Q43_','^Q37_','^Q21_','^Q31_','^Q38_','^Q17_',
    '^Q22','^Q18_','^Q20_',
]
filter_columns_2022=response_2022.filter(regex='^Q19_',axis=1).columns.to_list()+_single_choice_cols
for _pattern in _multi_choice_patterns:
    filter_columns_2022=filter_columns_2022+response_2022.filter(regex=_pattern,axis=1).columns.to_list()
filtered_df_2022=response_2022[filter_columns_2022].copy()


# Give the single-column questions readable names
filtered_df_2022.rename(
    columns={
        'Q43':'TPU',
        'Q16':'Years of ML',
        'Q11':'Years of Programming',
        'Q25':'Size',
        'Q29':'Salary',
        'Q2':'Age',
        'Q3':'Gender',
        'Q24':'Industry',
        'Q4':'Country',
        'Q8':'Highest_Fomal_Edu',
        'Q23':'Current_role',
        'Q22':'MLHub_',
    },
    inplace=True,
)

# Multiple-choice questions are spread over columns named 'Q<number>_<part>'. Each such
# column is renamed to '<prefix><most frequent value of the column>', where the prefix
# identifies the question.
cols_with_underscore=filtered_df_2022.filter(regex='^Q[1-9]*[0-9]_',axis=1).columns.to_list()

# Ordered lookup: the first marker found inside the column name decides the prefix
_question_prefixes={
    "Q6_":'DSC_',
    "Q42_":'MLHard_',
    "Q41_":'EthicalAL_',
    "Q40_":'MonitorML_',
    "Q37_":'ManML_',
    "Q7_":'MSPP_',
    "Q44_":'FMS_',
    "Q12_":'PL_',
    "Q13_":'IDE_',
    "Q14_":'HNB_',
    "Q15_":'Viz_',
    "Q17_":'MLFW_',
    "Q18_":'MLAlgo_',
    "Q19_":'CV_',
    "Q20_":'NLP_',
    "Q21_":'PrMW_',
    "Q28_":'Activity_',
    "Q31_":'CCP_',
    "Q33_":'CPr_',
    "Q34_":'DPr_',
    "Q35_":'DP_',
    "Q36_":'BI_',
    "Q38_":'AutoML_',
    "Q39_":'MLModel_',
}

prefix=""
for c in cols_with_underscore:
    # Columns of a question without an entry in the table get no prefix
    prefix=next((tag for marker, tag in _question_prefixes.items() if marker in c), "")
    # value_counts() sorts by frequency, so index[0] is the most frequent value
    # (presumably the answer option text - TODO confirm against the raw data)
    _top_value=filtered_df_2022[c].value_counts().index[0]
    filtered_df_2022.rename(columns={c:prefix+_top_value}, inplace=True)
    

# The first record contains the question text rather than a response, so drop it
filtered_df_2022=filtered_df_2022.drop(filtered_df_2022.index[0])

# Keep respondents that have a current role, excluding 'Currently not employed' and 'Other'.
# .copy() makes df_2022 an independent frame, so the assignments below never write into
# a view of filtered_df_2022.
_has_role=filtered_df_2022['Current_role'].notnull()
_excluded_role=filtered_df_2022['Current_role'].isin(['Currently not employed','Other'])
df_2022=filtered_df_2022[_has_role & ~_excluded_role].copy()

# Shorten role and country names for easy plotting on graphs
df_2022['Current_role']=df_2022['Current_role'].replace({
    'Data Analyst (Business, Marketing, Financial, Quantitative, etc)':'Data Analyst',
    'Manager (Program, Project, Operations, Executive-level, etc)':'Manager',
    'Machine Learning/ MLops Engineer':'ML/MLops Engineer',
})
df_2022['Country']=df_2022['Country'].apply(country_name_mapping)

# Flag every respondent as a computer vision user or non-user in 'CV_None':
# a row with a value in any of the CV columns below becomes 'UsesCV', a row where all of
# them are null becomes 'None'.
# NOTE(review): the columns are addressed by position; position 6 is skipped and is
# presumably the 'CV_None' column itself - confirm if the column selection ever changes.
cv_col_list_without_none=[0,1,2,3,4,5,7]
# `axis` must be passed by keyword: pandas 2.0 no longer accepts it positionally in any()
_uses_cv=df_2022.iloc[:,cv_col_list_without_none].notnull().any(axis=1)
df_2022.loc[_uses_cv,'CV_None']='UsesCV'
df_2022.loc[~_uses_cv,'CV_None']='None'

# Counts recorded by the original author: 2324 rows 'UsesCV', 6120 rows 'None'


In [4]:
# Representative numeric value for every salary bracket label
salary_dict = {
    '$0-999': 500,
    '1,000-1,999': 1500,
    '2,000-2,999': 2500,
    '3,000-3,999': 3500,
    '4,000-4,999': 4500,
    '5,000-7,499': 6250,
    '7,500-9,999': 8750,
    '10,000-14,999': 12500,
    '15,000-19,999': 17500,
    '20,000-24,999': 22500,
    '25,000-29,999': 27500,
    '30,000-39,999': 35000,
    '40,000-49,999': 45000,
    '50,000-59,999': 55000,
    '60,000-69,999': 65000,
    '70,000-79,999': 75000,
    '80,000-89,999': 85000,
    '90,000-99,999': 95000,
    '100,000-124,999': 112500,
    '125,000-149,999': 137500,
    '150,000-199,999': 175000,
    '200,000-249,999': 225000,
    '250,000-299,999': 275000,
    '300,000-499,999': 400000,
    '$500,000-999,999': 750000,
    '>$1,000,000': 1000000,
}

# Bracket labels are swapped for their numeric value; values without an entry are left as they are
df_2022['SalaryNum'] = df_2022['Salary'].replace(salary_dict)

# Display order of the experience brackets, plus a label -> position lookup for each
yrs_prog = [
    '< 1 years', '1-3 years', '3-5 years', '5-10 years',
    '10-20 years', '20+ years', 'I have never written code',
]
yrs_progIndex = {label: position for position, label in enumerate(yrs_prog)}
yrs_ML = [
    'Under 1 year', '1-2 years', '2-3 years', '3-4 years', '4-5 years',
    '5-10 years', '10-20 years', 'I do not use machine learning methods',
]
yrs_MLIndex = {label: position for position, label in enumerate(yrs_ML)}

In [5]:
#Preparing data of 2021
# Keep Q3, Q5, the Q18 columns and Q2 (renamed below to Country, Current_role and Gender)
filter_columns_2021=['Q3','Q5']+response_2021.filter(regex='^Q18_',axis=1).columns.to_list()+['Q2']
filtered_df_2021=response_2021[filter_columns_2021].copy()
filtered_df_2021.rename(columns = {'Q2':'Gender','Q3':'Country','Q5':'Current_role'}, inplace = True)

# Rename every 'Q18_...' column to 'CV_' + the most frequent value of that column
cols_with_underscore=filtered_df_2021.filter(regex='^Q[1-9]*[0-9]_',axis=1).columns.to_list()
prefix=""
for c in cols_with_underscore:
    prefix='CV_' if "Q18_" in c else ""
    filtered_df_2021.rename(columns = {c:prefix+ filtered_df_2021[c].value_counts().index[0]}, inplace = True)

# Removing some records
# The first row of the dataset is dropped
filtered_df_2021=filtered_df_2021.drop(filtered_df_2021.index[0])
# Keep respondents that have a current role, excluding 'Currently not employed' and 'Other'.
# .copy() makes df_2021 an independent frame, so the assignments below never write into
# a view of filtered_df_2021.
_has_role=filtered_df_2021['Current_role'].notnull()
_excluded_role=filtered_df_2021['Current_role'].isin(['Currently not employed','Other'])
df_2021=filtered_df_2021[_has_role & ~_excluded_role].copy()

# Shorten role and country names for easy plotting on graphs
df_2021['Current_role']=df_2021['Current_role'].replace({
    'Data Analyst (Business, Marketing, Financial, Quantitative, etc)':'Data Analyst',
    'Manager (Program, Project, Operations, Executive-level, etc)':'Manager',
    'Machine Learning/ MLops Engineer':'ML/MLops Engineer',
})
df_2021['Country']=df_2021['Country'].apply(country_name_mapping)

# Flag every respondent in 'CV_None': 'UsesCV' when any of the columns below holds a value,
# 'None' when all of them are null.
# NOTE(review): the columns are addressed by position; position 7 is skipped and is
# presumably the 'CV_None' column itself - confirm against the column order above.
cv_col_list_without_none=[2,3,4,5,6,8]
# `axis` must be passed by keyword: pandas 2.0 no longer accepts it positionally in any()
_uses_cv=df_2021.iloc[:,cv_col_list_without_none].notnull().any(axis=1)
df_2021.loc[_uses_cv,'CV_None']='UsesCV'
df_2021.loc[~_uses_cv,'CV_None']='None'

In [6]:
#Preparing data of 2020
# Keep Q3, Q5, the Q18 columns and Q2 (renamed below to Country, Current_role and Gender)
filter_columns_2020=['Q3','Q5']+response_2020.filter(regex='^Q18_',axis=1).columns.to_list()+['Q2']
filtered_df_2020=response_2020[filter_columns_2020].copy()
filtered_df_2020.rename(columns = {'Q2':'Gender','Q3':'Country','Q5':'Current_role'}, inplace = True)

# Rename every 'Q18_...' column to 'CV_' + the most frequent value of that column
cols_with_underscore=filtered_df_2020.filter(regex='^Q[1-9]*[0-9]_',axis=1).columns.to_list()
prefix=""
for c in cols_with_underscore:
    prefix='CV_' if "Q18_" in c else ""
    filtered_df_2020.rename(columns = {c:prefix+ filtered_df_2020[c].value_counts().index[0]}, inplace = True)

# Removing some records
# The first row of the dataset is dropped
filtered_df_2020=filtered_df_2020.drop(filtered_df_2020.index[0])
# Keep respondents that have a current role, excluding 'Currently not employed' and 'Other'.
# .copy() makes df_2020 an independent frame, so the assignments below never write into
# a view of filtered_df_2020.
_has_role=filtered_df_2020['Current_role'].notnull()
_excluded_role=filtered_df_2020['Current_role'].isin(['Currently not employed','Other'])
df_2020=filtered_df_2020[_has_role & ~_excluded_role].copy()

# Shorten role and country names for easy plotting on graphs
df_2020['Current_role']=df_2020['Current_role'].replace({
    'Data Analyst (Business, Marketing, Financial, Quantitative, etc)':'Data Analyst',
    'Manager (Program, Project, Operations, Executive-level, etc)':'Manager',
    'Machine Learning/ MLops Engineer':'ML/MLops Engineer',
})
df_2020['Country']=df_2020['Country'].apply(country_name_mapping)

# Flag every respondent in 'CV_None': 'UsesCV' when any of the columns below holds a value,
# 'None' when all of them are null.
# NOTE(review): the columns are addressed by position; position 7 is skipped and is
# presumably the 'CV_None' column itself - confirm against the column order above.
cv_col_list_without_none=[2,3,4,5,6,8]
# `axis` must be passed by keyword: pandas 2.0 no longer accepts it positionally in any()
_uses_cv=df_2020.iloc[:,cv_col_list_without_none].notnull().any(axis=1)
df_2020.loc[_uses_cv,'CV_None']='UsesCV'
df_2020.loc[~_uses_cv,'CV_None']='None'

In [7]:
### Graph Ploting function
def drawSingleBar(p_df,base,title="",pwidth=500,pheight=500):
    """Bar chart of the share of computer vision users and non-users per ``base`` value.

    Parameters
    ----------
    p_df : pandas.DataFrame
        Dataset holding the ``base`` column and the 'CV_None' flag column.
    base : str
        Column by which the respondents are grouped (one bar group per value).
    title : str
        Figure title.
    pwidth, pheight : int
        Figure size in pixels.

    The figure is shown directly; nothing is returned.
    """
    # Respondents per (base value, CV flag)
    counts=p_df.groupby([base,'CV_None'])['CV_None'].count().rename('Count')

    # Share of each flag within its base value. transform() keeps the original index,
    # whereas groupby.apply() prepends the group key on pandas >= 2.0 and then makes
    # reset_index() fail on the duplicated level.
    totals=counts.groupby(level=0).transform('sum')
    df=(100 * (counts / totals).round(2)).rename('Percentage').reset_index()

    df[base]=df[base].apply(industry_name_mapping)
    df=df.sort_values(['CV_None','Percentage'], ascending=[False,False])
    df=df.rename(columns={'CV_None':'Use of Computer Vision'})

    fig=px.bar(df, x=base, y='Percentage', color='Use of Computer Vision',
               color_discrete_sequence=['#B24E8F','#EBACBF'],
               width=pwidth, height=pheight, text_auto=True)
    fig.update_layout(title_text=title,title_font_color="#7aa64d",legend_title_font_color="#7aa64d")
    fig.show()
    
    
def drawGroupedBar(p_df,base,keyword,keyword_name,title=""):
    """Grouped bar chart comparing option popularity between CV users and non-users.

    Parameters
    ----------
    p_df : pandas.DataFrame
        Dataset.
    base : str
        Column flagging whether a respondent uses computer vision, with the
        values 'UsesCV' and 'None' (the 'CV_None' column of this notebook).
    keyword : str
        Column-name prefix of the question to plot, e.g. 'AutoML' for the
        automated machine learning tools.
    keyword_name : str
        Display name of the question; used as the option-label column.
    title : str
        Figure title.

    The figure is shown directly; nothing is returned.
    """
    df=get_base_keword_wise_df(p_df.copy(),base,keyword,keyword_name,True)
    df[keyword_name]=df[keyword_name].apply(cv_name_mapping)

    # Split on the column named by `base` rather than a hard-coded 'CV_None',
    # so the parameter is honoured.
    Yes_CV=df[df[base]=='UsesCV'].sort_values('Percentage', ascending=False)
    No_CV=df[df[base]=='None'].sort_values('Percentage', ascending=False)

    fig=go.Figure()
    fig.add_trace(go.Bar(x=Yes_CV[keyword_name], y=Yes_CV['Percentage'], name='Works on CV',
                         text=Yes_CV['Percentage'].round(2), textposition='inside',
                         marker=dict(color='#B24E8F')))
    fig.add_trace(go.Bar(x=No_CV[keyword_name], y=No_CV['Percentage'], name='Does not work on CV',
                         text=No_CV['Percentage'].round(2), textposition='inside',
                         marker=dict(color='#EBACBF')))
    fig.update_layout(title_text=title, title_y=0.87, title_x=0.5, autosize=False, width=900, height=500,
                      margin=dict(l=50,r=50,b=100,t=100,pad=4), title_font=dict(size=18,color='#7aa64d'))
    fig.show()
    
def drawStackedBar(p_df,base,keyword,keyword_name,title=""):
    """Horizontal stacked bar chart of option popularity for CV users and non-users.

    Parameters
    ----------
    p_df : pandas.DataFrame
        Dataset.
    base : str
        Column flagging whether a respondent uses computer vision; the value
        'None' means non-user (the 'CV_None' column of this notebook).
    keyword : str
        Column-name prefix of the question to plot, e.g. 'AutoML' for the
        automated machine learning tools.
    keyword_name : str
        Display name of the question; used as the option-label column.
    title : str
        Figure title.

    The figure is shown directly; nothing is returned.
    """
    df=p_df.copy()
    # Readable legend labels. The column named by `base` is used throughout instead of a
    # hard-coded 'CV_None', so the parameter is honoured.
    df[base]=df[base].apply(lambda x:'Does not work on CV' if x=='None' else 'Works on CV')
    df=get_base_keword_wise_df(df,base,keyword,keyword_name,True)
    df[keyword_name]=df[keyword_name].apply(cv_name_mapping)
    df=df.sort_values([base,'Percentage'], ascending=[True,True])
    df=df.rename(columns={base:'Use of Computer Vision'})

    fig=px.bar(df, x='Percentage', y=keyword_name, color='Use of Computer Vision',
               color_discrete_sequence=['#EBACBF','#B24E8F'], orientation='h', width=1200, text_auto=True)
    fig.update_layout(title_text=title, title_y=0.95, title_x=0.5, autosize=False,
                      title_font_color="#7aa64d", legend_title_font_color="#7aa64d")
    fig.show()
    

    
def drawSubplotBar(p_df,base,keyword,keyword_name,title=""):
    """Two side-by-side horizontal bar charts: option popularity for CV users (left)
    and non-users (right).

    Parameters
    ----------
    p_df : pandas.DataFrame
        Dataset.
    base : str
        Column flagging whether a respondent uses computer vision, with the
        values 'UsesCV' and 'None' (the 'CV_None' column of this notebook).
    keyword : str
        Column-name prefix of the question to plot.
    keyword_name : str
        Display name of the question; used as the option-label column and in
        the default title.
    title : str
        Figure title; a default is generated from ``keyword_name`` when empty.

    The figure is shown directly; nothing is returned.
    """
    df=get_base_keword_wise_df(p_df.copy(),base,keyword,keyword_name,True)
    df[keyword_name]=df[keyword_name].apply(cv_name_mapping)

    # Split on the column named by `base` rather than a hard-coded 'CV_None',
    # so the parameter is honoured.
    Yes_CV=df[df[base]=='UsesCV'].sort_values('Percentage', ascending=True)
    No_CV=df[df[base]=='None'].sort_values('Percentage', ascending=True)

    if title=="":
        title='Popularity of '+keyword_name+' usage(percentage) between who uses CV and does not use CV'

    fig=make_subplots(rows=1, cols=2)
    fig.add_trace(go.Bar(x=Yes_CV['Percentage'], y=Yes_CV[keyword_name], name='Works on CV',
                         text=Yes_CV['Percentage'].round(2), orientation='h', textposition='inside',
                         marker=dict(color='#B24E8F')), row=1, col=1)
    fig.add_trace(go.Bar(x=No_CV['Percentage'], y=No_CV[keyword_name], name='Does not work on CV',
                         text=No_CV['Percentage'].round(2), orientation='h', textposition='inside',
                         marker=dict(color='#EBACBF')), row=1, col=2)

    fig.update_layout(height=600, width=1300, title_text=title, title_x=0.5,
                      title_font=dict(size=18,color='#7aa64d'))
    fig.show()
    
    
def find_country_loc(country):
    """Return the ISO 3166-1 alpha-3 code for a country name, or "" when it is unknown.

    The name is looked up in pycountry first. Names that pycountry does not
    resolve (including the short names used in this notebook, such as 'USA'
    or 'UK') fall back to a small hand-written table.
    """
    import pycountry  # imported here because the notebook's import cell does not load it

    try:
        val=pycountry.countries.get(name=country)
    except LookupError:
        # NOTE(review): a missing name is treated like a None result in case the installed
        # pycountry raises instead of returning None - confirm against the version in use.
        val=None
    if val is not None:
        return val.alpha_3

    fallback_codes={
        'USA':"USA",
        'UK':"GBR",
        'Russia':"RUS",
        'South Korea':"KOR",
        'Hong Kong':"HKG",
        'UAE':"ARE",
        'Taiwan':"TWN",
        'Iran':"IRN",
        'Czech Republic':"CZE",
    }
    return fallback_codes.get(country,"")


def drawChoropleth(p_df,base,param_title="",pwidth=900,pheight=500):
    """World map coloured by the percentage of computer vision users per country.

    Parameters
    ----------
    p_df : pandas.DataFrame
        Dataset holding the ``base`` column and the 'CV_None' flag column.
    base : str
        Column holding the country names (the 'Country' column of this notebook).
    param_title : str
        Figure title.
    pwidth, pheight : int
        Figure size in pixels.

    The figure is shown directly; nothing is returned.
    """
    # Respondents per (country, CV flag)
    counts=p_df.groupby([base,'CV_None'])['CV_None'].count().rename('Count')

    # Share of each flag within its country. transform() keeps the original index,
    # whereas groupby.apply() prepends the group key on pandas >= 2.0 and then makes
    # reset_index() fail on the duplicated level.
    totals=counts.groupby(level=0).transform('sum')
    df=(100 * (counts / totals).round(2)).rename('Percentage').reset_index()

    df=df.sort_values('Percentage', ascending=False)
    df[base]=df[base].apply(industry_name_mapping)
    # Use the column named by `base` rather than a hard-coded 'Country'
    df['Location']=df[base].apply(find_country_loc)
    Yes_CV=df[df['CV_None']=='UsesCV'].copy()

    fig=px.choropleth(Yes_CV, locations='Location', color="Percentage", hover_name=base,
                      color_continuous_scale=px.colors.sequential.Magenta)
    fig.update_layout(title_text=param_title, title_y=0.95, title_x=0.5, autosize=False,
                      width=pwidth, height=pheight, margin=dict(l=50,r=50,b=100,t=100,pad=4),
                      title_font=dict(size=18,color='#7aa64d'))
    fig.show()

def drawCVMethodHeatMap(p_df,base,keyword,keyword_name,title="",pwidth=800,pheight=500):
    """Draw a heatmap of method usage (y) against `base` (x) for CV users only.

    Parameters
    ----------
    p_df : pandas.DataFrame
        Must contain a 'CV_None' column; only rows equal to 'UsesCV' are kept.
    base : str
        Column placed on the x axis.
    keyword, keyword_name : str
        Forwarded to ``get_base_keword_wise_df``; `keyword_name` is also the
        name of the result column that is mapped through ``cv_name_mapping``
        and placed on the y axis.
    title : str
        Figure title.
    pwidth, pheight : int
        Figure size in pixels.

    The function shows the figure and returns None.
    """
    cv_users = p_df[p_df['CV_None']=='UsesCV'].drop(columns=['CV_None'])

    method_share = get_base_keword_wise_df(cv_users,base,keyword,keyword_name,True)
    method_share[keyword_name] = method_share[keyword_name].apply(cv_name_mapping)

    # Only label cells above 15 so low-share cells stay uncluttered.
    cell_labels = method_share['Percentage'].apply(lambda pct: pct if pct>15 else "")

    # y uses `keyword_name` (previously hard-coded to 'Computer Vision').
    fig = go.Figure(data=go.Heatmap(z=method_share['Percentage'],x=method_share[base],y=method_share[keyword_name],colorscale='Magenta',text=cell_labels,texttemplate="%{text}"))
    fig.update_layout(title_text=title, title_y=0.95, title_x=0.5,autosize=False,width=pwidth,height=pheight,margin=dict(l=50,r=50,b=100,t=100,pad=4),title_font=dict(size=18,color='#7aa64d'))
    fig.show()



# Table of Content 

1. [What is Computer Vision and why is it important??](#CV_Def)
2. [Purpose of the Analysis](#CV_Pur)
3. [Overview of the Analysis](#CV_Over)
4. [Who Uses Computer Vision?](#CV_Use)
5. [Benefits of Working on Computer Vision](#CV_Ben)
6. [Preferences of CV users?](#CV_Pref)
7. [Methods the CV users use](#CV_Methods)
8. [Key Findings from the Analysis](#CV_Key)
9. [Conclusion](#CV_Con)
10. [References](#CV_Ref)

<a id='CV_Def'></a>
<h2 class="main-heading">Computer Vision in 2022: What is the Current State? </h2>

<h2 class="sub-heading">What is Computer Vision and why is it important?? </h2>

<p class='default-font'>
    Computer Vision is an exciting branch of computer science for the programmers. Computer vision (CV) is a subset of AI that enables systems to interpret information from digital visual inputs such as photos and videos. The insights gained from computer vision are then used to take automated actions. Just like AI gives computers the ability to ‘think’, computer vision allows them to ‘see’.</p>
<p class='default-font'>
    Computer vision can be defined as the study of extracting information from images. The typical example would be- Extract the number plate text from a car using automatic numberplate recognition (ANPR). Identify the gender of a face in a picture, the emotion it is displaying, or perhaps even try to recognize the individual person. Determine if a vehicle is stuck on the railroad line by examining an image of a level crossing. Calculate the population based on a picture of a train platform. Is the component shown in a manufacturing line image flawless or flawed? In each example, we are attempting to gain some valuable information from the images.<a href='https://www.alvervalleysoftware.com/2016/02/28/computer-vision-a-developers-viewpoint/'>[1]</a></p>
    
<p class='default-font'>Computer vision also has some other great uses<a href='https://www.linkedin.com/pulse/what-computer-vision-why-important-gabriella-leone/'>[2]</a>: </p>
<ul class='default-font'>
  <li>Optical Character Recognition (OCR): Recognizing and identifying text in documents, a scanner does this.</li>
  <li>Vision Biometrics: Recognizing people who have been missing through iris patterns.</li>
  <li>Object Recognition: Great for retail and fashion to find products in real-time based off of an image or scan.</li>
  <li>Special Effects: Motion capture and shape capture, any movie with CGI.</li>
  <li>3-D Printing and Image Capture: Used in movies, architectural structures, and more.</li>
  <li>Sports: In a game when they draw additional lines on the field, yup computer vision.</li>
  <li>Social Media: Anything with a story that allows you to wear something on your face.</li>
  <li>Smart Cars: Through computer vision they can identify objects and humans.</li>
  <li>Medical Imaging: 3D imaging and image guided surgery.</li>
</ul>
<p class='default-font'>It is a rapidly growing field. It's gaining popularity day by day. </p>


<a id='CV_Pur'></a>

<p class='sub-heading'>Purpose of The Analysis</p>

<p class='default-font'>
More and more developers, engineers, professionals, etc. are getting involved in this field of study as it continues to grow. With the aid of the data from the 2022 Kaggle survey, we want to discover some facts about the state of computer vision today. 
</p>

<p class='default-font'>The whole analysis is divided into two segments:</p>

<ul class='default-font'>
<li>First Segment: Total dataset is divided into two groups: those who utilize computer vision (CV User) and those who do not(Non-CV User). These two groups are then compared based on their choices, preferences, and so on.   
    
</li>
<li>Second Segment: The purpose of this segment is to identify those who utilize 'Computer vision' methods on a regular basis, what are their selections and preferences for using different methods.</li>
</ul>

<p class='default-font'>Throughout the analysis, we are looking for answers to the following categories of questions:</p>


<ul class='default-font'>
<li>Who uses Computer Vision?</li>
<li>Benefits of Working on Computer Vision</li>
<li>Preferences (Tools, Learning Platforms etc) of CV users in comparison with Non-CV users?</li>
<li>Methods the CV users use</li>
</ul>



<a id='CV_Over'></a>
<p class='sub-heading'>Overview of the Analysis</p>
<p class='default-font'>We are primarily focusing on "professionals" for this analysis, thus we exclude 'students' as well as those who are 'Currently not employed' or have an 'Other' role at the moment. This is because we want to gain a better understanding of what is currently going on in the field of 'computer vision' in the industry.</p>

<p class='default-font'>For each visualization, the number of responders was counted, normalized, and converted to percentages. Throughout the analysis, wherever CV and Non-CV users are compared, <b style='color:#B24E8F'>Dark color</b> represents those who work on computer vision on a regular basis (CV User) and <b style='color:#EBACBF'>Light color</b> represents those who don't work on computer vision (Non-CV User). For other graphs, the <b>colorscale</b> <b style='color:#FF00FF'>Magenta</b> is used. </p>

<a id='CV_Use'></a>
<p class='sub-heading'>Who Uses Computer Vision?</p>

<p class='graph-heading'>Industry</p>

In [8]:
# CV usage by industry (the trailing numbers are presumably width/height — confirm in drawSingleBar).
drawSingleBar(
    df_2022,
    'Industry',
    'Utilization of Computer Vision in the Industry',
    700,
    400,
)

<p class='default-font'>Computer vision has the potential to boost revenue, save time and money, improve customer experience and automate dangerous work, according to an IDG/Insight survey. The ways organizations have implemented or are planning to implement CV include <a href='https://www.techrepublic.com/article/report-computer-vision-adoption-expected-to-grow-significantly-in-the-near-future/'>[3]</a>: </p>
<ul class='default-font'>
<li>Improving security</li>
<li>Improving employee safety</li>
<li>Anomaly defect detection during production/manufacturing</li>
<li>Improving customer experiences</li>
</ul>

<p class='default-font'>On the other hand, with any new technology there are bound to be challenges or concerns inhibiting early investment. According to the IDG/Insight survey, organizations across industries reported several common obstacles to adopting computer vision
<a href='https://www.insight.com/en_US/campaigns/hva/insight/early-adopters-see-value-in-computer-vision.html'>[4]</a>:</p>
<ul class='default-font'>
<li>Security, privacy and/or compliance</li>
<li>Concerns about data overload</li>
<li>Lack of knowledge or capacity to manage</li>
<li>Time to recognize ROI</li>
<li>Cost of technology</li>


</ul>

<p class='default-font'>
These obstacles largely affect how readily industries adopt computer vision in their business.
</p> 
    
<p class='default-font'>From the above graph we see that <b>computer vision is most widely used (37%) in the computer/technology</b> industry. This is expected, because the term 'Computer Vision' itself indicates that it is closely related to computer/technology.</p>
    
<p class='default-font'>After that, the highest use of <b>computer vision is seen in Academics/Education and Medical/Pharmaceuticals (31%)</b>. The primary driver behind the use of CV in academic and educational settings is innovation, while in the medical and pharmaceutical industries, it is to increase efficiency so that specialists may concentrate on more crucial activities. </p>
    
<p class='default-font'>Online Service/Internet based service industry is also doing well in adopting computer vision(30%).</p>

<p class='default-font'>The transport industry is also doing well (more than 25%). Although computer vision has not yet been widely adopted industry-wide, transportation organizations that have invested are realizing gains in terms of safety, customer experience, operational efficiency, sustainability and revenue generation, and are looking to take advantage of further advances in technology in the future. Automation and touchless processes integrated with computer vision greatly enhance transportation services as well. Some challenges of CV adoption in this industry are the cost of adoption, system reliability, data security, etc.<a href='https://www.cio.com/article/308384/computer-vision-is-transforming-the-transportation-industry-making-it-safer-more-efficient-and-improving-the-bottom-line.html'>[5]</a></p>

<p class='default-font'>Other industries have less than 25% of respondents using CV on regular basis.</p>

<p class='default-font'><b>The adoption of CV in the government sector is relatively low (21%).</b> Any action taken by governments should be done with extreme caution. AI in public services could be at best ineffective and at worst extremely dangerous if it is introduced without the proper consideration for ethics and safety<a href='https://www.oxfordinsights.com/ai-readiness2019'>[6]</a>. This might be one of the main reasons that governments are lagging behind when it comes to the application of AI in this sector. As computer vision is a sub-field of AI, the use of CV is lagging behind in the government sector for the same reason.</p>

<p class='default-font'><b>The broadcasting industry has the lowest adoption of CV(15%).</b> </p>

<p class='graph-heading'>Company Size</p>

In [9]:
# CV usage by organization size.
drawSingleBar(
    df_2022,
    'Size',
    'Use of Computer Vision by Organization\'s Size',
    600,
    400,
)

<p class='default-font'>Regular usage of computer vision slightly depends upon the company size. </p>
<p class='graph-heading'>Country</p>

In [10]:
# CV usage by country, drawn as a world map (a bar-chart variant was tried earlier and dropped).
df = df_2022.copy()
drawChoropleth(df, 'Country', 'Use of Computer Vision by Country', pwidth=1000)


<p class='default-font'>If we look carefully at the above image, we can see two clusters of dark shade. These countries are more advanced in the use of computer vision. One of these clusters consists of some countries in <b>East and South-East Asia</b>: Vietnam, China, Taiwan, South Korea, Nepal, Thailand, Japan. The other group consists of some <b>Western Europe and Northern Africa countries</b>: Belgium, Germany, Netherlands, Peru, Ethiopia, Spain, Algeria, France, Tunisia. </p>

<p class='graph-heading'>Current Role</p>

In [11]:
# CV usage by current job role.
drawSingleBar(
    df_2022,
    'Current_role',
    'Computer Vision by Current Role',
    800,
)

<p class='default-font'>The graph above provides a comparative picture of the amount of work data science professionals do with computer vision according to their role. This comparative picture is quite consistent with reality.</p>

<ul class='default-font'>
<li>Computer vision is mostly used by <b>ML/MLops Engineers (60%)</b>. Then <b>research scientists</b> work the most with computer vision <b>(41%)</b>.</li>
<li>Among developer architects, data architects, data scientists, and teacher/professor, a sizable number of specialists(almost one-third) work on computer vision. </li>
<li>Data Administrators, Data Analysts and Statisticians work the least with computer vision.</li>
</ul>
<p class='graph-heading'>Gender</p>

In [12]:
# Share of CV / non-CV respondents among men and women, one subplot per survey year.
title='Man and Woman in Computer Vision'
fig = make_subplots(rows=1, cols=3)

yearly_frames = [('2022', df_2022), ('2021', df_2021), ('2020', df_2020)]
for col, (year, survey_df) in enumerate(yearly_frames, start=1):
    df = survey_df.loc[survey_df['Gender'].isin(['Man','Woman']), ['Gender','CV_None']]
    # Respondent count per (gender, CV usage).
    counts = df.groupby(['Gender','CV_None'])['CV_None'].count().rename('Count')
    # Percentage within each gender. transform('sum') keeps the
    # (Gender, CV_None) index, so reset_index() works regardless of how
    # groupby.apply handles group keys (pandas >= 2.0 prepends them).
    totals = counts.groupby(level=0).transform('sum')
    df = (100 * (counts / totals).round(2)).rename('Percentage').reset_index()
    df = df.sort_values(['Gender','Percentage'], ascending=True)

    fig.add_trace(
        go.Bar(
            x=df['Gender'],
            y=df['Percentage'],
            text=df['Percentage'].round(2),
            textposition='inside',
            marker=dict(color=df["Percentage"], colorscale='Magenta_r'),
        ),
        row=1, col=col,
    )
    fig.update_xaxes(title_text=year, row=1, col=col)

fig.update_traces(hovertemplate='%{x},%{y}%<extra></extra>', selector=dict(type='bar'))
fig.update_layout(height=400,showlegend=False, width=600, title_text=title,title_font=dict(size=18,color='#7aa64d'))
fig.show()

<p class='default-font'>Women do less computer vision work on a regular basis than men. <b>But the gap between men and women is decreasing.</b> The above graph shows the gap between men and women working in CV over the past three years. In 2020 the gap was 10 percentage points, in 2021 it was 9 and in 2022 it is 8. </p>

<p class='graph-heading'>Years of Experience</p>

In [13]:
# Share of CV users by years of programming experience (left) and years of ML experience (right).
title=(
    'Computer Vision by \'Years of Programming Experience\' and \'Years of ML Experience\''
    + '<br><sup><i >Q11:For how many years have you been writing code and/or programming?    Q16:For how many years have you used machine learning methods?</i></sup>'
)
fig = make_subplots(rows=1, cols=2)

experience_panels = [
    ('Years of Programming', 'Years of Programming Experience'),
    ('Years of ML', 'Years of ML Experience'),
]
for col, (experience_col, axis_title) in enumerate(experience_panels, start=1):
    df = df_2022[[experience_col,'CV_None']]
    # Respondent count per (experience bucket, CV usage).
    counts = df.groupby([experience_col,'CV_None'])['CV_None'].count().rename('Count')
    # Percentage within each experience bucket. transform('sum') keeps the
    # two-level index, so reset_index() works regardless of how
    # groupby.apply handles group keys (pandas >= 2.0 prepends them).
    totals = counts.groupby(level=0).transform('sum')
    df = (100 * (counts / totals).round(2)).rename('Percentage').reset_index()
    df = df.sort_values('Percentage', ascending=False)
    Yes_CV = df[df['CV_None']=='UsesCV'].copy()

    fig.add_trace(
        go.Bar(
            x=Yes_CV[experience_col],
            y=Yes_CV['Percentage'],
            text=Yes_CV['Percentage'].round(2),
            textposition='inside',
            marker=dict(color=Yes_CV["Percentage"], colorscale='Magenta'),
        ),
        row=1, col=col,
    )
    # Axis titles previously read 'Experiecne'.
    fig.update_xaxes(title_text=axis_title, row=1, col=col)

fig.update_traces(hovertemplate='%{x},%{y}%<extra></extra>', selector=dict(type='bar'))
fig.update_layout(height=400,showlegend=False, width=1000, title_text=title,title_font=dict(size=18,color='#7aa64d'))
fig.show()

<ul class='default-font'>
<li><b>Working on computer vision is linearly correlated with years of programming experience.</b> From the left graph we see that the rate of work on computer vision increases along with programming experience.</li>
<li>Years of ML experience and working in computer vision have different relationship. For specialists with 2 to 20 years of ML expertise, the rate of computer vision work is roughly the same.</li>
</ul>

<a id='CV_Ben'></a>
<p class='sub-heading'>Benefits of Working on Computer Vision</p>

In [14]:
from IPython.display import display, Markdown

# Mean salary per (country, CV usage), reshaped to one row per country with
# one column per CV_None value.
mean_salary = df_2022.groupby(['Country', 'CV_None']).SalaryNum.mean().round(2).to_frame().reset_index()
df = mean_salary.pivot(index='Country', columns='CV_None')['SalaryNum'].reset_index()

# Per-country ratio of the CV users' mean salary to the non-CV users' mean salary.
df['ratio'] = (df['UsesCV'] / df['None']).round(2)
df = df.rename(columns={'None':'Non CV User','UsesCV':'CV User'})

# Report the unweighted mean of the per-country ratios.
mean_ratio = round(df['ratio'].mean(),2)
display(Markdown(
    "<p class='bold-text'><b>Average ratio of salary between Kagglers using and not using computer vision on a regular basis in their country: "
    + str(mean_ratio)
    + "</b></p>"
))

<p class='bold-text'><b>Average ratio of salary between Kagglers using and not using computer vision on a regular basis in their country: 2.37</b></p>

<a id='CV_Pref'></a>
<p class='sub-heading'>Preferences of CV users?</p>

In [15]:
# Programming-language usage, split by the CV_None group column.
drawSubplotBar(
    df_2022,
    'CV_None',
    'PL',
    'Programming Language',
    'Programming Languages'
    + '<br><i><sup>Q12:What programming languages do you use on a regular basis? </sup></i>',
)

<p class='default-font'>The graph above compares the most popular languages in these two groups.</p>

<ul class='default-font'>
<li>In these two groups, Python and SQL are the most popular programming languages.</li>
<li><b>Use of C and C++ is higher among CV users than Non-CV users.</b></li>
<li><b>R is more often used by non-CV users than by CV users.</b></li>
<li>Other languages are used less often in both groups.</li>
</ul>

In [16]:
# Cloud-platform usage, split by the CV_None group column.
drawGroupedBar(
    df_2022,
    'CV_None',
    'CCP',
    'Cloud Computing Platforms',
    'Cloud Computing Platforms'
    + '<br><i><sup>Q31:Which of the following cloud computing platforms do you use? </sup></i>',
)

<p class='default-font'>We can infer the following regarding cloud computing from the diagram above-</p>

<ul class='default-font'>
<li><b>AWS, GCP, and Microsoft Azure are the most popular cloud computing platforms</b> in both groups, ranking first, second, and third, respectively.</li>
<li><b>Non-CV users have a higher tendency(17.29%) than CV users (7.99%) to not use cloud computing platforms.</b></li>
<li>Although Alibaba Cloud remains the leader with a 37% market share in China, from a global perspective its usage does not exceed 2%.</li>
</ul>

In [17]:
# AutoML tool usage, split by the CV_None group column.
drawStackedBar(
    df_2022,
    'CV_None',
    'AutoML',
    'Auto ML',
    'Auto ML'
    + '<br><i><sup>Q38:Do you use any of the following automated machine learning tools?</sup></i>',
)

<p class='default-font'>Automated machine learning (AutoML) is the process of automating the application of machine learning to real-world problems. The entire machine learning workflow is automated with AutoML. For AutoML the use of cloud infrastructures is preferred. By looking for the best hyperparameters and models for the modeling task, AutoML attempts to replace all the manual tuning and model experimentation that modern Data Scientists do.</p>



<ul class='default-font'>
<li><b>Most of the respondents don't use AutoML for their tasks (Non-CV 75% and CV users 57.04%)</b>. This may be because the majority of AutoML tools are designed for the most common use-cases. AutoML may not work well across datasets that come from different sources and are in different formats. As a result, it restricts users' capabilities.</li>
<li>One thing is to note here that, <b>CV users use automated machine learning tools more often than Non-CV users.</b> This could be because AutoML solution providers offer solutions for a range of computer vision applications, including image classification, object detection, instance segmentation, and more.</li>
<li><b>Those who use AutoML, mostly use Google Cloud AutoML and then Azure and Amazon Sagemaker.</b></li>

</ul>

In [18]:
# Business-intelligence tool usage, split by the CV_None group column.
drawStackedBar(
    df_2022,
    'CV_None',
    'BI',
    'Business Intelligence',
    'Business Intelligence Tools'
    + '<br><i><sup>Q36:Do you use any of the following business intelligence tools?</sup></i>',
)

<p class='default-font'></p>

<ul class='default-font'>
<li><b>A sizeable percentage(more than 25%) of the two categories do not use BI.</b></li>
<li>Use of Tableau and Microsoft Power BI is slightly less among CV users.</li>
<li>Google Data Studio is less often used than Tableau and Microsoft Power BI in both groups. </li>
</ul>

<!-- <p class='graph-heading'>Comparison of popular BI Tools among CV and non-CV Users</p> -->

In [19]:
# ML-framework usage, split by the CV_None group column.
# The subtitle now reads 'Q17:' like every other chart (the colon was missing).
drawSubplotBar(
    df_2022,
    'CV_None',
    'MLFW',
    'ML Framework',
    'Machine Learning Frameworks'
    + '<br><i><sup>Q17:Which of the following machine learning frameworks do you use on a regular basis?</sup></i>',
)

<p class='default-font'>Scikit-learn and TensorFlow remain the top two most widely used ML frameworks for both groups. CV users use <b>keras</b> more often than Non-CV users. <b>PyTorch</b> is also more popular among CV users.</p>

In [20]:
# Hosted-notebook usage, split by the CV_None group column.
drawStackedBar(
    df_2022,
    'CV_None',
    'HNB',
    'Hosted Notebooks',
    'Hosted Notebooks'
    + '<br><i><sup>Q14:Do you use any of the following hosted notebook products?  </sup></i>',
)

<p class='default-font'><b>Hosted Notebook usage is more common among CV users than Non-CV users.</b> In both groups, <b>Colab and Kaggle are the two most preferred</b> hosted notebook services. Both Google subsidiaries Kaggle and Colab offer a nearly identical set of capabilities for developing and implementing data analysis or machine learning applications. The others are far behind these two. </p>

In [21]:
# Shorter column labels (presumably so they fit on the chart — confirm in drawSubplotBar).
# The MSPP_ entries are carried over unchanged from the original cell.
ide_label_overrides = {
    'IDE_ Jupyter Notebook': 'IDE_ Jupyter NB',
    'IDE_ Visual Studio Code (VSCode) ': 'IDE_VSCode ',
    'MSPP_Social media platforms (Reddit, Twitter, etc)': 'MSPP_Social Media',
    'MSPP_Video platforms (YouTube, Twitch, etc)': 'MSPP_Video PLTF',
    'MSPP_None / I do not study data science': 'MSPP_None',
}
df = df_2022.rename(columns=ide_label_overrides)

drawSubplotBar(
    df,
    'CV_None',
    'IDE',
    'IDE',
    'Integrated Development Environment'
    + '<br><i><sup>Q13:Which of the following integrated development environments (IDE\'s) do you use on a regular basis? </sup></i>',
)

<p class='default-font'>There is no significant difference between the two groups in terms of IDE use. The order of the top four popular IDEs is the same for both groups: Jupyter Notebook, Visual Studio Code, Pycharm and JupyterLab. <b>RStudio is less often used among CV users.</b></p>

In [22]:
# Shorter column labels for the learning-platform (MSPP_) columns.
mspp_label_overrides = {
    'MSPP_University courses': 'MSPP_Uni CRSE',
    'MSPP_Online courses (Coursera, EdX, etc)': 'MSPP_Online CRSE',
    'MSPP_Social media platforms (Reddit, Twitter, etc)': 'MSPP_Social Media',
    'MSPP_Video platforms (YouTube, Twitch, etc)': 'MSPP_Video PLTF',
    'MSPP_None / I do not study data science': 'MSPP_None',
}
df = df_2022.rename(columns=mspp_label_overrides)

drawSubplotBar(
    df,
    'CV_None',
    'MSPP',
    'MSPP',
    'Most Helpful Product/Platform'
    + '<br><i><sup>Q7:What products or platforms did you find to be most helpful when you first started studying data science? </sup></i>',
)

<p class='default-font'>Kaggle, online courses and video platforms are the top three learning platforms for both groups. Popularity of Kaggle is slightly higher among CV users. Kaggle holds the first position among CV users. <b>University courses are less often preferred as a learning platform among Kagglers (26.54%).</b></p>

In [23]:
# Shorter column labels for the media-source (FMS_) columns.
fms_label_overrides = {
    'FMS_Journal Publications (peer-reviewed journals, conference proceedings, etc)': 'FMS_Journal Pub',
    'FMS_Email newsletters (Data Elixir, O\'Reilly Data & AI, etc)': 'FMS_Email newsl',
    'FMS_Slack Communities (ods.ai, kagglenoobs, etc)': 'FMS_Slack Comm',
}
df = df_2022.rename(columns=fms_label_overrides)

drawSubplotBar(
    df,
    'CV_None',
    'FMS',
    'FMS',
    'Favourite Media Source'
    + '<br><i><sup>Q44:Who/what are your favorite media sources that report on data science topics? </sup></i>',
)

<p class='default-font'>Almost the same picture here. Again Kaggle is the first choice as favourite media source among CV users. Both groups' top three favourite media sources are Kaggle, YouTube, and blogs. Another thing to note here is that <b>those who often utilize computer vision consider journal publications more frequently than those who do not</b>.  </p>

In [24]:
# Shorter column labels for the MLModel_ columns.
mlmodel_label_overrides = {
    'MLModel_ TensorFlow Extended (TFX) ': 'MLModel_TensorFlow Extn',
    'MLModel_ Multi Model Server (MMS) ': 'MLModel_Multi MDL SRV',
    'MLModel_ Triton Inference Server ': 'MLModel_Triton Infern SRV',
    'MLModel_ OpenVINO Model Server ': 'MLModel_OpenVINO SRV',
}
df = df_2022.rename(columns=mlmodel_label_overrides)

# NOTE(review): the 'Q2' label and the "model hubs/repositories" wording look
# inconsistent with the model-server columns renamed above (TFX, MMS, Triton,
# OpenVINO) — confirm the question number and title against the survey schema.
drawSubplotBar(
    df,
    'CV_None',
    'MLModel',
    'ML Model Hubs/Repositories',
    'ML Model Hubs/Repositories'
    + '<br><i><sup>Q2:Which of the following ML model hubs/repositories do you use most often?</sup></i>',
)

<p class='default-font'>ML model hubs/repositories are repositories of trained machine learning models ready for fine-tuning and deployable anywhere. <b>The use of ML model hubs/repositories is more common among CV users than Non-CV users. The top two repositories are MLFlow and TensorFlow Extended (TFX).</b></p>

In [25]:
# Specialized-hardware usage, split by the CV_None group column.
drawSubplotBar(
    df_2022,
    'CV_None',
    'MLHard',
    'MLHard',
    'Specialized Hardware '
    + '<br><i><sup>Q42:Do you use any of the following types of specialized hardware when training machine learning models?</sup></i>',
)

<p class='default-font'><b>Almost half of the Non-CV user do not use any specialized hardware for training their machine learning models while 62.29% of CV user use GPUs.</b> Use of TPUs is also higher among CV users. This indicates that CV users are more in need of using specialized hardware to carry out their regular task. This is obvious as CV users have to work with a lot of image data and working with huge amount of image data requires special computation power.   </p>

In [26]:
# Model-monitoring tool usage, split by the CV_None group column.
drawSubplotBar(
    df_2022,
    'CV_None',
    'MonitorML',
    'MonitorML',
    'Tools to monitor ML models'
    + '<br><i><sup>Q40:Do you use any tools to help monitor your machine learning models and/or experiments?</sup></i>',
)

<p class='default-font'>Model Monitoring is an operational stage in the machine learning lifecycle that comes after model deployment. Monitoring is a way to track the performance of the model in production. It requires continuously monitoring ML models for changes such as model degradation, data drift, and idea drift, and ensuring that model is performing at an appropriate level <a href='https://medium.com/geekculture/monitoring-your-deployed-machine-learning-models-in-production-8822d1bab11a#a00c'>[7]</a>. The graph above showcases how often data scientists employ model monitoring tools and which tool they prefer. </p>

<p class='default-font'><b>67.65% of Non-CV users do not use any ML model monitoring tools, while this rate is 31.18% among CV users.</b> The top three most popular ML model monitoring tools among CV users are TensorBoard, MLFlow and Weights & Biases. Other tools are each used by no more than 4%.  </p>
<a id='CV_Methods'></a>
<h2 class='sub-heading'>Methods the CV users use</h2>

In [27]:
# Pie chart of computer-vision method usage among CV users only.
keyword='CV'
keyword_name='Computer Vision'
title=(
    'Popular Computer Vision methods'
    + '<br><sup><i >Q19:Which categories of computer vision methods do you use on a regular basis? </i></sup>'
)

# Keep only CV users; the group column is dropped before summarising.
df = df_2022[df_2022['CV_None']=='UsesCV'].drop(columns=['CV_None'])

df_sum = get_summary_df(df,keyword,keyword_name,True)
df_sum[keyword_name] = df_sum[keyword_name].apply(cv_name_mapping)
df_sum = df_sum.sort_values('Percentage', ascending=True)

colors = px.colors.sequential.Magenta
method_pie = go.Pie(
    labels=df_sum[keyword_name],
    values=df_sum["Percentage"],
    textinfo='percent',
    marker=dict(colors=colors, line=dict(color='#000000', width=2)),
    insidetextorientation='radial',
)
fig = go.Figure(data=[method_pie])
fig.update_layout(title_text=title ,title_y=0.90, title_x=0.5,autosize=False,width=900,height=500,title_font=dict(size=18,color='#7aa64d'))
fig.show()

<p class='default-font'> Among the computer vision methods, <b>CV users mostly use Image classification and other general purpose networks (27.4%)</b>. The adoption of object detection, image segmentation methods, and general image/video tools is almost the same (between 17% and 19.4%). Generative Networks are used less often (9.72%). Vision transformer networks are used the least (6.28%). Kagglers also use some other methods for Computer Vision tasks, but that percentage is very small (1.23%).  </p>

In [28]:
# CV-method heatmap by gender, restricted to respondents answering 'Man' or 'Woman'.
is_man_or_woman = df_2022['Gender'].isin(['Man','Woman'])
df = df_2022[is_man_or_woman].copy()
drawCVMethodHeatMap(
    df,
    'Gender',
    'CV',
    'Computer Vision',
    'Use of computer vision methods between man and woman'
    + '<br><sup><i >Q3:What is your gender?</i></sup>'
    + '<br><sup><i >Q19:Which categories of computer vision methods do you use on a regular basis? </i></sup>',
)

<p class='default-font'>Both men and women mostly use Image Classification/General Networks. <b>The image segmentation method is slightly more popular among women (21%)</b>. Men tend to use general image/video editing tools more frequently than women.</p>

In [29]:
# Bubble chart of CV-method usage per industry, CV users only.
keyword='CV'
keyword_name='Computer Vision'

# Keep only CV users; the group column is dropped before aggregating.
df = df_2022[df_2022['CV_None']=='UsesCV'].drop(columns=['CV_None'])
df = get_industry_wise_df(df,keyword,keyword_name,True)

df[keyword_name] = df[keyword_name].apply(cv_name_mapping)
df['Industry'] = df['Industry'].apply(industry_name_mapping)

# Both marker colour and marker size encode the percentage.
bubble_marker = dict(
    color=df['Percentage'],
    colorscale ='Magenta',
    colorbar=dict(thickness=10),
    size=df['Percentage'],
)
bubbles = go.Scatter(
    x=df['Industry'],
    y=df[keyword_name],
    showlegend=False,
    text=df['Percentage'],
    mode='markers',
    name='',
    marker=bubble_marker,
)
fig = go.Figure(bubbles)
fig.update_traces(textfont_size=14)
fig.update_xaxes(tickangle=25, showgrid=False)
fig.update_yaxes(showgrid=False)
fig.update_layout(
    title_text=(
        'Use of Computer Vision Methods in different Industry'
        + '<br><sup><i >Q24:In what industry is your current employer/contract?</i></sup>'
        + '<br><sup><i >Q19:Which categories of computer vision methods do you use on a regular basis? </i></sup>'
    ),
    title_y=0.90,
    title_x=0.5,
    autosize=False,
    width=800,
    height=500,
    margin=dict(l=50,r=50,b=100,t=100,pad=4),
    title_font=dict(size=18,color='#7aa64d'),
)
fig.show()

<p class='default-font'>Kagglers from all industries mostly use Image Classification/ General Networks. 
<b>The use of Object Detection in Shipping/Transport is greater than in any other industry (30%).</b> The use of Image Segmentation does not vary much across industries. Though GAN is less used by Kagglers regardless of industry, interestingly <b>the use of GAN in the Broadcasting industry is greater than in other industries (15%)</b>.</p>

In [30]:
# CV-method heatmap restricted to a fixed set of job roles.
roles = [
    'Data Scientist',
    'Research Scientist',
    'Data Analyst',
    'ML/MLops Engineer',
    'Teacher / professor',
    'Data Architect',
]
in_selected_roles = df_2022['Current_role'].isin(roles)
df = df_2022[in_selected_roles]
drawCVMethodHeatMap(
    df,
    'Current_role',
    'CV',
    'Computer Vision',
    'Use of Computer Vision Methods in different Role'
    + '<br><sup><i >Q23:Select the title most similar to your current role:</i></sup>'
    + '<br><sup><i >Q19:Which categories of computer vision methods do you use on a regular basis? </i></sup>',
)

<p class='default-font'>Reviewing the use of CV methods in light of the current role reveals no different picture. Here the order of methods based on their use is the same as before. But one thing to note here is that Data Analysts use Image Segmentation slightly more often than other roles. Another is that <b>the use of GAN is significantly higher among Data Architects (17.46%)</b> than among other roles. </p>

In [31]:
# Exclude respondents who declined to state their education level. The .copy()
# gives an independent frame so the relabelling below does not write into a
# slice of df_2022.
df=df_2022.loc[df_2022['Highest_Fomal_Edu']!='I prefer not to answer'].copy()
# Shorten the one very long category label; every other value is left untouched.
df['Highest_Fomal_Edu']=df['Highest_Fomal_Edu'].replace({
    'Some college/university study without earning a bachelor’s degree': 'College/uni study without bachelor',
})
# drawCVMethodHeatMap is defined earlier in the notebook; the trailing 1200 and
# 600 are presumably the figure width and height -- confirm against its definition.
drawCVMethodHeatMap(
    df,
    'Highest_Fomal_Edu',
    'CV',
    'Computer Vision',
    (
        'Use of Computer Vision Methods based on Highest Formal Degree'
        '<br><sup><i >Q8:What is the highest level of formal education that you have attained or plan to attain within the next 2 years?</i></sup>'
        '<br><sup><i >Q19:Which categories of computer vision methods do you use on a regular basis? </i></sup>'
    ),
    1200,
    600,
)

<p class='default-font'>The graph above provides a comparison of CV methods used by highest level of formal education. <b>The use of Image Segmentation among Professional Doctorates is relatively higher than among others (21.04%)</b>, and Object Detection is slightly less used among those who have a Doctoral Degree (16.58%).</p>

<a id='CV_Key'></a>
<p class='sub-heading'>Key Findings from the Analysis</p>

<ul class='default-font'>
<p class='ref-heading'>Who Uses Computer Vision?:</p>
<li>Computer/Technology, Academics/Education and Medical/Pharmaceutical are the top three industries using computer vision.</li>
<li>An organization's use of computer vision does not depend much on company size.</li>
<li>Two clusters can be seen when looking at the world map in terms of CV use. One cluster includes countries from East and South-East Asia, and the other includes countries from Western Europe and Northern Africa.</li>
<li>The top two roles that use computer vision the most are ML/MLops Engineer (60%) and Research Scientist (41%).</li>
<li>Women do less computer vision work on a regular basis than men. But reviewing the data from the previous two years and this year reveals that the gender gap is gradually decreasing.</li>
<li>Working on computer vision correlates linearly with programming experience. However, there is no direct correlation between the number of years of ML expertise and computer vision work.</li>
<p class='ref-heading'> Benefits of Working on Computer Vision</p>
<li>Average ratio of salary between Kagglers using and not using computer vision on a regular basis in their country: 2.37.</li>
<p class='ref-heading'> CV users' preferences over non-CV users</p>
<li>Kaggle, online courses and video platforms are the top three learning platforms for both groups. Kaggle is slightly more popular among CV users.</li>
<li>Kaggle is also the first choice as favourite media source among CV users.</li>
<li>Python and SQL are the most popular programming languages among CV and Non-CV users. Use of C and C++ is higher among CV users than Non-CV users.</li>
<li>Scikit-learn and TensorFlow remain the two most widely used ML frameworks for both groups. CV users use Keras more often than Non-CV users. PyTorch is also more popular among CV users.</li>
<li>CV users use hosted notebooks more often than Non-CV users. The Google subsidiaries Kaggle and Colab are the two most preferred hosted notebook services.</li>
<li>There is no significant difference between the two groups in terms of IDE use. Jupyter Notebook, Visual Studio Code, PyCharm and JupyterLab are the four most popular IDEs.</li>
<li>AWS, GCP, and Microsoft Azure are the most popular cloud computing platforms among both CV and Non-CV users.</li>
<li>Most Kagglers don't use AutoML for their tasks. CV users use automated machine learning tools more often than Non-CV users. The reason might be that AutoML solution providers offer solutions for a range of computer vision applications, including image classification, object detection, instance segmentation, and more.</li>
<li>A sizeable percentage (more than 25%) of Kagglers from both categories do not use BI tools yet. The two most popular BI tools are Tableau and Microsoft Power BI.</li>
    
<li>ML model hubs/repositories are used more often by CV users than by Non-CV users. The top two repositories are MLFlow and TensorFlow Extended.</li>
<li>CV users use more specialized hardware than Non-CV users. Almost half of the Non-CV users do not use any specialized hardware for training their machine learning models, while 62.29% of CV users use GPUs. Use of TPUs is also higher among CV users.</li>
<li>67.65% of Non-CV users do not use any ML model monitoring tools, while this rate is 31.18% among CV users. The three most popular ML model monitoring tools among CV users are TensorBoard, MLFlow and Weights & Biases.</li>

<p class='ref-heading'> Methods the CV users use</p>
<li>Among the computer vision methods, CV users mostly use Image Classification and other general-purpose networks (27.4%). The adoption of object detection, image segmentation methods, and general image/video tools is almost the same (between 17% and 19.4%). Generative Networks are used less often (9.72%). Vision transformer networks are used the least (6.28%). Kagglers also use some other methods for computer vision tasks, but that percentage is very small (1.23%).</li>
<li>Both men and women mostly use Image Classification/General Networks. Image segmentation is slightly more popular among women (21%). Men tend to use general image/video editing tools more frequently than women.</li>
<li>Kagglers from all industries mostly use Image Classification/General Networks. 
The use of Object Detection in Shipping/Transport is greater than in any other industry (30%). The use of Image Segmentation does not vary much across industries. The use of GANs in the Broadcasting industry is greater than in other industries (15%).</li>
<li>Data Analysts use Image Segmentation slightly more often than other roles, and the use of GAN is significantly higher among Data Architects than among other roles.</li>
<li>The use of Image Segmentation among Professional Doctorates is relatively higher than among others (21.04%), and Object Detection is slightly less used among those who have a Doctoral Degree (16.58%).</li>


</ul>

<a id='CV_Con'></a>
<p class='sub-heading'>Conclusion:</p>
<p class='default-font'>Computer vision is a rapidly growing field in research and applications. Understanding the present maturity levels of the computer vision area across industries and geographies is our goal in selecting this topic. Additionally, we are interested in learning about the preferred tools, technologies, services, platforms, and methodologies among users of computer vision.</p>

<p class='default-font'>We examined this dataset from several angles in order to gain a clear picture of the present level of computer vision. We have seen that the application of computer vision in business is still relatively low. Apart from <b>tech and academics</b>, the <b>medical</b> sector is the industry where computer vision has seen the most use.</p>

<p class='default-font'>We have also noticed that countries in <b>East and South-East Asia such as Vietnam, China, Taiwan, South Korea, Nepal, Thailand and Japan</b> are doing great in computer vision. The reason behind so much improvement in computer vision in these countries is the huge patronage at the government level. </p>

<p class='default-font'>It is also clear that those who work in computer vision on a regular basis make <b>2.37</b> times as much as those who do not.</p>
<p class='default-font'>The preferences for products, tools and technologies between individuals who work in computer vision and those who do not can be compared and contrasted in a number of ways. Use of <b>C, C++ (programming languages), PyTorch (ML framework), and Kaggle and Colab notebooks (hosted notebooks)</b> is higher among CV users than Non-CV users. Use of <b>AutoML, ML model hubs/repositories, specialized hardware, and ML model monitoring tools</b> is also higher among CV users.</p>

<p class='default-font'>The most popular computer vision methods include <b>image classification and other general-purpose networks</b>. Then, roughly equally, object detection, image segmentation approaches, and generic image/video tools are used.</p>

<p class='default-font'>Finally, we observed that computer vision is still applied more in research labs than in real-life business applications. Once it moves from research labs into the real world, computer vision will be able to realize its full potential. We anticipate that computer vision applications will enter a booming phase in the upcoming years. </p>



<a id='CV_Ref'></a>
<p class='sub-heading'>References:</p>
<ol type="1" class='default-font'>
<li><a href='https://www.alvervalleysoftware.com/2016/02/28/computer-vision-a-developers-viewpoint/'>Computer Vision – a Developer’s Viewpoint</a></li>
<li><a href='https://www.linkedin.com/pulse/what-computer-vision-why-important-gabriella-leone/'>What is Computer Vision and Why Is It Important?</a></li>
<li><a href='https://www.techrepublic.com/article/report-computer-vision-adoption-expected-to-grow-significantly-in-the-near-future/'>Report: Computer vision adoption expected to grow significantly in the near future</a></li>
<li><a href='https://www.insight.com/en_US/campaigns/hva/insight/early-adopters-see-value-in-computer-vision.html'>Early Adopters See Value in Computer Vision</a></li>
<li><a href='https://www.cio.com/article/308384/computer-vision-is-transforming-the-transportation-industry-making-it-safer-more-efficient-and-improving-the-bottom-line.html'>Computer Vision Is Transforming the Transportation Industry, Making It Safer, More Efficient and Improving the Bottom Line</a></li>
<li><a href='https://www.oxfordinsights.com/ai-readiness2019'>Government Artificial Intelligence Readiness Index 2019</a></li>
<li><a href='https://medium.com/geekculture/monitoring-your-deployed-machine-learning-models-in-production-8822d1bab11a#a00c'>Monitoring Your Deployed Machine Learning Models in Production</a></li>
    
</ol>