In [86]:
import pandas as pd
import numpy as np

In [87]:
# Read a CSV file and convert it to a data frame
df = pd.read_csv('Internet_all.csv')

# Find all rows that are exactly the same
duplicated_rows = df.duplicated()

# Remove all duplicated rows
df = df.drop_duplicates()

# Delete rows with empty name columns
df = df.dropna(subset=['name'])

# Show basic statistics for each column in the data box
print(df.describe())
print(df.info())

             name   salary             company address experience eduBack  \
count      107052   107052              107052  107052     107052  107052   
unique      52109      678               27403      72          7      11   
top     Java开发工程师  1万-1.5万  软通动力信息技术(集团)股份有限公司   上海-浦东       1-3年      本科   
freq         2030     5795                1233    6252      34903   53379   

       companyType   scale    info  
count       103395  106294  107052  
unique          16       7   78085  
top             民营  20-99人      []  
freq         60415   26582     474  
<class 'pandas.core.frame.DataFrame'>
Int64Index: 107052 entries, 0 to 154235
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   name         107052 non-null  object
 1   salary       107052 non-null  object
 2   company      107052 non-null  object
 3   address      107052 non-null  object
 4   experience   107052 non-null  object
 5   eduBack      107052 

In [88]:
# Define a function that determines the city name and city id based on the value in the address column
def get_city_info(address):
    city_id = None
    city_name = None
    if '北京' in address:
        city_id = 530
        city_name = '北京'
    elif '上海' in address:
        city_id = 538
        city_name = '上海'
    elif '广州' in address:
        city_id = 763
        city_name = '广州'
    elif '深圳' in address:
        city_id = 765
        city_name = '深圳'
    elif '杭州' in address:
        city_id = 653
        city_name = '杭州'
    return pd.Series({'city_id': city_id, 'city_name': city_name})

# Add city_name and city_id to the data frame using the apply() function and a lambda expression
df[['city_id', 'city_name']] = df['address'].apply(lambda x: get_city_info(x))

In [89]:
df = df.dropna(subset=['city_id'])
df['city_id'] = df['city_id'].astype(int)

In [90]:
# Combine similar education levels
df['eduBack'] = df['eduBack'].replace(['EMBA', 'MBA/EMBA', '硕士'], '硕士')
df['eduBack'] = df['eduBack'].replace(['大专','专科'], '大专')
df['eduBack'] = df['eduBack'].replace(['中专/中技', '中技'], '中专')
df['eduBack'] = df['eduBack'].replace(['高中','初中及以下'], '高中及以下')

# Map education requirements to corresponding codes
edu_map = {
    '学历不限': 0,
    '高中及以下':1,
    '中专':2,
    '大专':3,
    '本科': 4,
    '硕士': 5,
    '博士': 6
}

df['eduBack_code'] = df['eduBack'].map(edu_map)

In [91]:
edu_en = {
    '学历不限': 'No Eduction Required',
    '高中及以下':'High School and below',
    '中专': 'Technical Secondary',
    '大专': 'Junior College',
    '本科': 'Bachelor',
    '硕士': 'Master',
    '博士': 'Doctor'
}
df['eduBack_en'] = df['eduBack'].map(edu_en)

In [92]:
# Calculate value counts and percentages
edu_count = df['eduBack_en'].value_counts()
edu_percentages = round((edu_count / edu_count.sum()) * 100,2)

edu_summary = pd.DataFrame({'Count': edu_count, 'Percentage': edu_percentages})
edu_summary.reset_index(inplace=True)
edu_summary.columns = ['eduBack_en', 'Count', 'Percentage']

print(edu_summary)

              eduBack_en  Count  Percentage
0               Bachelor  53379       49.86
1         Junior College  34666       32.38
2   No Eduction Required   8508        7.95
3                 Master   5296        4.95
4    Technical Secondary   2494        2.33
5  High School and below   2316        2.16
6                 Doctor    393        0.37


In [93]:
city_en = {530: 'Beijing', 538: 'Shanghai', 763: 'Guangzhou', 765: 'Shenzhen', 653: 'Hangzhou'}
df['city_en'] = df['city_id'].map(city_en)

In [94]:
df['scale'] = df['scale'].str.replace('人', '')
df['scale'] = df['scale'].str.replace('以上', '+')
df['scale'] = df['scale'].str.replace('以下', '-')

# Fill empty or blank values with a default code of 7
df['scale'] = df['scale'].fillna('other')

# Map education requirements to corresponding codes
scale_map = {
    '20-': 0,
    '20-99': 1,
    '100-299': 2,
    '300-499': 3,
    '500-999': 4,
    '1000-9999': 5,
    '10000+': 6,
    'other': 7
}

df['scale_code'] = df['scale'].map(scale_map)

In [95]:
# Calculate value counts and percentages
sca_count = df['scale'].value_counts()
sca_percentages = round((sca_count / sca_count.sum()) * 100,2)

sca_summary = pd.DataFrame({'Count': sca_count, 'Percentage': sca_percentages})
sca_summary.reset_index(inplace=True)
sca_summary.columns = ['scale', 'Count', 'Percentage']

print(sca_summary)

       scale  Count  Percentage
0      20-99  26582       24.83
1  1000-9999  24783       23.15
2    100-299  23938       22.36
3     10000+  11107       10.38
4    500-999  10080        9.42
5    300-499   5042        4.71
6        20-   4762        4.45
7      other    758        0.71


In [96]:
scale_id = {
    0:'Less than 20',
    1:'20-99',
    2:'100-299',
    3:'300-499',
    4:'500-999',
    5:'1000-9999',
    6:'Over 10000',
    7: 'Other'
}
df['scale_en'] = df['scale_code'].map(scale_id)

In [97]:
df['experience'] = df['experience'].str.replace('年', 'year')
df['experience'] = df['experience'].str.replace('以上', '+')
df['experience'] = df['experience'].str.replace('以下', '-')
df['experience'] = df['experience'].replace(['无经验','不限'], 'No Work Experience Required')

exp_map = {
    '1year-': 0,
    '1-3year': 1,
    '3-5year': 2,
    '5-10year': 3,
    '10year+': 4,
    'No Work Experience Required': 5
}

df['exp_code'] = df['experience'].map(exp_map)

In [98]:
exp_id = {
    0:'Less than 1 year',
    1:'1-3 years',
    2:'3-5 years',
    3:'5-10 years',
    4:'Over 10 years',
    5:'No Work Experience Required'
}
df['exp_en'] = df['exp_code'].map(exp_id)

In [99]:
# Calculate value counts and percentages
exp_count = df['exp_en'].value_counts()
exp_percentages = round((exp_count / exp_count.sum()) * 100,2)

exp_summary = pd.DataFrame({'Count': exp_count, 'Percentage': exp_percentages})
exp_summary.reset_index(inplace=True)
exp_summary.columns = ['exp_en', 'Count', 'Percentage']

print(exp_summary)

                        exp_en  Count  Percentage
0                    1-3 years  34903       32.60
1  No Work Experience Required  29206       27.28
2                    3-5 years  29021       27.11
3                   5-10 years  11508       10.75
4             Less than 1 year   1700        1.59
5                Over 10 years    714        0.67


In [100]:
df = df[~df['salary'].isin(['面议']) & 
        ~df['salary'].str.contains('天') &
        ~df['salary'].str.contains('时') &
        ~df['salary'].str.contains('次') &
        ~df['salary'].str.contains('下')]

In [101]:
def bottom_top(x,num=0):
    """Clean up each row of payroll data to unify the payroll 
    in thousands. Split into two columns and return the specified 
    column with num=0 for the lowest column and num=1 
    for the highest column."""
    if x == '面议' or x.find('天') != -1 or x.find('时') != -1 or x.find('次') != -1 or x.find('下') != -1:
        return np.nan
    elif x.find('/月') != -1:
        x = x.replace('/月', '')
    elif x.endswith('元/月'):
        x = float(x[:-3]) / 1000
    if len(x.split('-')) >1:
        x = x.split('-')[num]
        if x.find('千') != -1:
            x=float(x[0:x.find('千')])
        elif x.find('万') != -1:
            x=float(x[0:x.find('万')])*10
        elif x.find('') != -1:
            x=0
        return x

df=df.drop(index = df[df['salary'] == 0].index )
df['bottom'] = df.salary.apply(lambda x : bottom_top(x,0))
df['top'] = df.salary.apply(lambda x : bottom_top(x,1))
df['avg'] = (df['bottom']+df['top'])/2
df['avg_w'] =df['avg']/10

In [102]:
#Add a new column salary_label to classify the average salary into six categories: <5k,5k-1w,1w-1w5,1w5-2w,2w-5w,>5w.
df['salary_label'] = pd.cut(df.avg, bins=[0, 5, 10, 15,20, 50, max(df.avg)], labels=['<5000', '5000-10000', '10000-15000','15000-20000', '20000-50000', '>50000'])

In [103]:
# Calculate value counts and percentages
sal_count = df['salary_label'].value_counts()
sal_percentages = round((sal_count / sal_count.sum()) * 100,2)

sal_summary = pd.DataFrame({'Count': sal_count, 'Percentage': sal_percentages})
sal_summary.reset_index(inplace=True)
sal_summary.columns = ['sal_en', 'Count', 'Percentage']

print(sal_summary)

        sal_en  Count  Percentage
0   5000-10000  31984       31.09
1  10000-15000  31378       30.50
2  15000-20000  19276       18.73
3  20000-50000  17291       16.81
4        <5000   2421        2.35
5       >50000    540        0.52


In [104]:
unique_company_types = df['companyType'].unique()
co_map={'民营':'Private', '上市公司':'Public Company', '股份制企业':'Joint-Stock enterprise', '国企':'State-Owned Enterprise', '合资':'Joint Venture', '外商独资':'Wholly Foreign-Owned', '其他':'Other', '事业单位':'Public Institution','银行':'Bank'}
df['Comp_en']=df['companyType'].map(co_map)


In [105]:
keywords = ['Java开发', 'UI设计师', 'Web前端', 'PHP', 'Python', 'Android', '美工', '深度学习', '算法工程师',
            'Hadoop', 'Node.js', '数据开发', '数据分析师', '数据架构', '人工智能', '区块链', '电气工程师',
            '电子工程师', 'PLC测试工程师', '设备工程师', '硬件工程师', '结构工程师', '工艺工程师', '产品经理',
            '新媒体运营', '运营专员', '淘宝运营', '天猫运营', '产品助理', '产品运营', '淘宝客服', '游戏运营',
            '编辑','开发','java','android','AI','数据','运营','产品',
            'ui','设计','客服','销售','平面','修图','制作','创意','前端','Web','软件','架构','RPA',
            '编程','Data','爬虫','算法','建模','系统','安卓','修图','美术','自然语言','NLP',
            '机器','工程','react','C++','硬件','分析师','软件',"C"]

df['keywords'] = df['name'].apply(lambda x: [keyword for keyword in keywords if keyword.lower() in x.lower()])


In [106]:
# Create classification functions for Internet jobs
def classify_industry(row):
    keywords = row['keywords']
    if any(keyword in keywords for keyword in ['Java开发', 'PHP', 'Python', 'Android', 'Node.js', 'Hadoop', 'react', 'C++', '编程', '开发', 'java', "C",'python', 'hadoop', 'node.js', 'android', 'react', 'C++', '编程', '系统', '安卓', '软件','AI','人工智能','区块链','深度学习','算法工程师','算法','RPA','Web','Web前端','前端','后端','编程','开发']):
        return 'Development'
    elif any(keyword in keywords for keyword in ['电气工程师', '电子工程师', 'PLC测试工程师', '设备工程师', '硬件工程师', '结构工程师', '工艺工程师', '机器', '工程', '硬件']):
        return 'Engineering'
    elif any(keyword in keywords for keyword in ['新媒体运营', '运营专员', '淘宝运营', '天猫运营', '淘宝客服', '游戏运营', '运营','客服','销售']):
        return 'Operations'
    elif any(keyword in keywords for keyword in ['产品经理', '产品助理', '产品运营','产品','分析师']):
        return 'Product'
    elif any(keyword in keywords for keyword in ['UI设计师', '美工','ui','设计','平面','修图','制作','创意','修图','美术']):
        return 'Design'
    elif any(keyword in keywords for keyword in ['数据开发', '数据分析师', '数据架构', '爬虫', '建模', '自然语言', '自然语言','NLP', 'Data', '数据']):
        return 'Data'
    else:
        return 'Other'

df['industry_category'] = df.apply(classify_industry, axis=1)


In [107]:
# Calculate value counts and percentages
cat_count = df['industry_category'].value_counts()
cat_percentages = round((cat_count / cat_count.sum()) * 100,2)

cat_summary = pd.DataFrame({'Count': cat_count, 'Percentage': cat_percentages})
cat_summary.reset_index(inplace=True)
cat_summary.columns = ['cat_en', 'Count', 'Percentage']

print(cat_summary)

        cat_en  Count  Percentage
0   Operations  26473       25.72
1  Engineering  25645       24.91
2  Development  24233       23.54
3        Other  12316       11.96
4      Product   6820        6.63
5       Design   6301        6.12
6         Data   1150        1.12


In [85]:
print(df.info())
# Save the cleaned data frame as a new CSV file
df.to_csv('Internet_cleaned_file.csv', index=False)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 107052 entries, 0 to 154235
Data columns (total 26 columns):
 #   Column             Non-Null Count   Dtype   
---  ------             --------------   -----   
 0   name               107052 non-null  object  
 1   salary             107052 non-null  object  
 2   company            107052 non-null  object  
 3   address            107052 non-null  object  
 4   experience         107052 non-null  object  
 5   eduBack            107052 non-null  object  
 6   companyType        103395 non-null  object  
 7   scale              107052 non-null  object  
 8   info               107052 non-null  object  
 9   city_id            107052 non-null  int64   
 10  city_name          107052 non-null  object  
 11  eduBack_code       107052 non-null  int64   
 12  eduBack_en         107052 non-null  object  
 13  city_en            107052 non-null  object  
 14  scale_code         107052 non-null  int64   
 15  scale_en           107052 non-null