In [15]:
import pandas as pd
import numpy as np

In [16]:
# Read a CSV file and convert it to a data frame
df = pd.read_csv('Internet_all.csv')

# Find all rows that are exactly the same
duplicated_rows = df.duplicated()

# Remove all duplicated rows
df = df.drop_duplicates()

# Delete rows with empty name columns
df = df.dropna(subset=['name'])

# Show basic statistics for each column in the data box
print(df.describe())
print(df.info())

Summary statistics:
             name   salary             company address experience eduBack  \
count      107052   107052              107052  107052     107052  107052   
unique      52109      678               27403      72          7      11   
top     Java开发工程师  1万-1.5万  软通动力信息技术(集团)股份有限公司   上海-浦东       1-3年      本科   
freq         2030     5795                1233    6252      34903   53379   

       companyType   scale    info  
count       103395  106294  107052  
unique          16       7   78085  
top             民营  20-99人      []  
freq         60415   26582     474  
<class 'pandas.core.frame.DataFrame'>
Int64Index: 107052 entries, 0 to 154235
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   name         107052 non-null  object
 1   salary       107052 non-null  object
 2   company      107052 non-null  object
 3   address      107052 non-null  object
 4   experience   107052 non-null  object
 5   

In [17]:
# Define a function that determines the city name and city id based on the value in the address column
def get_city_info(address):
    city_id = None
    city_name = None
    if '北京' in address:
        city_id = 530
        city_name = '北京'
    elif '上海' in address:
        city_id = 538
        city_name = '上海'
    elif '广州' in address:
        city_id = 763
        city_name = '广州'
    elif '深圳' in address:
        city_id = 765
        city_name = '深圳'
    elif '杭州' in address:
        city_id = 653
        city_name = '杭州'
    return pd.Series({'city_id': city_id, 'city_name': city_name})

# Add city_name and city_id to the data frame using the apply() function and a lambda expression
df[['city_id', 'city_name']] = df['address'].apply(lambda x: get_city_info(x))

In [18]:
df = df.dropna(subset=['city_id'])
df['city_id'] = df['city_id'].astype(int)


In [19]:
# Combine similar education levels
df['eduBack'] = df['eduBack'].replace(['EMBA', 'MBA/EMBA', '硕士'], '硕士')
df['eduBack'] = df['eduBack'].replace(['大专','专科'], '大专')
df['eduBack'] = df['eduBack'].replace(['中专/中技', '中技'], '中专')
df['eduBack'] = df['eduBack'].replace(['高中','初中及以下'], '高中及以下')

# Map education requirements to corresponding codes
edu_map = {
    '学历不限': 0,
    '高中及以下':1,
    '中专':2,
    '大专':3,
    '本科': 4,
    '硕士': 5,
    '博士': 6
}

df['eduBack_code'] = df['eduBack'].map(edu_map)

In [20]:
edu_en = {
    '学历不限': 'No Eduction Required',
    '高中及以下':'High School and below',
    '中专': 'Technical Secondary',
    '大专': 'Junior College',
    '本科': 'Bachelor',
    '硕士': 'Master',
    '博士': 'Doctor'
}
df['eduBack_en'] = df['eduBack'].map(edu_en)
city_en = {530: 'Beijing', 538: 'Shanghai', 763: 'Guangzhou', 765: 'Shenzhen', 653: 'Hangzhou'}
df['city_en'] = df['city_id'].map(city_en)

In [21]:
df['scale'] = df['scale'].str.replace('人', '')
df['scale'] = df['scale'].str.replace('以上', '+')
df['scale'] = df['scale'].str.replace('以下', '-')

# Fill empty or blank values with a default code of 7
df['scale'] = df['scale'].fillna('other')

# Map education requirements to corresponding codes
scale_map = {
    '20-': 0,
    '20-99': 1,
    '100-299': 2,
    '300-499': 3,
    '500-999': 4,
    '1000-9999': 5,
    '10000+': 6,
    'other': 7
}

df['scale_code'] = df['scale'].map(scale_map)

In [22]:
scale_id = {
    0:'Less than 20',
    1:'20-99',
    2:'100-299',
    3:'300-499',
    4:'500-999',
    5:'1000-9999',
    6:'Over 10000',
    7: 'Other'
}
df['scale_en'] = df['scale_code'].map(scale_id)

In [24]:
df['experience'] = df['experience'].str.replace('年', 'year')
df['experience'] = df['experience'].str.replace('以上', '+')
df['experience'] = df['experience'].str.replace('以下', '-')
df['experience'] = df['experience'].replace(['无经验','不限'], 'No Work Experience Required')

exp_map = {
    '1year-': 0,
    '1-3year': 1,
    '3-5year': 2,
    '5-10year': 3,
    '10year+': 4,
    'No Work Experience Required': 5
}

df['exp_code'] = df['experience'].map(exp_map)
print(df['exp_code'].value_counts())
print(df['experience'].value_counts().describe)

1    34903
5    29206
2    29021
3    11508
0     1700
4      714
Name: exp_code, dtype: int64
<bound method NDFrame.describe of 1-3year                        34903
No Work Experience Required    29206
3-5year                        29021
5-10year                       11508
1year-                          1700
10year+                          714
Name: experience, dtype: int64>


In [25]:
exp_id = {
    0:'Less than 1 year',
    1:'1-3 years',
    2:'3-5 years',
    3:'5-10 years',
    4:'Over 10 years',
    5:'No Work Experience Required'
}
df['exp_en'] = df['exp_code'].map(exp_id)

In [26]:
df = df[~df['salary'].isin(['面议']) & 
        ~df['salary'].str.contains('天') &
        ~df['salary'].str.contains('时') &
        ~df['salary'].str.contains('次') &
        ~df['salary'].str.contains('下')]

In [27]:
print(df.info())
print(df.head())
print('Shape:', df.shape)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 102938 entries, 0 to 154235
Data columns (total 18 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   name          102938 non-null  object
 1   salary        102938 non-null  object
 2   company       102938 non-null  object
 3   address       102938 non-null  object
 4   experience    102938 non-null  object
 5   eduBack       102938 non-null  object
 6   companyType   99373 non-null   object
 7   scale         102938 non-null  object
 8   info          102938 non-null  object
 9   city_id       102938 non-null  int64 
 10  city_name     102938 non-null  object
 11  eduBack_code  102938 non-null  int64 
 12  eduBack_en    102938 non-null  object
 13  city_en       102938 non-null  object
 14  scale_code    102938 non-null  int64 
 15  scale_en      102938 non-null  object
 16  exp_code      102938 non-null  int64 
 17  exp_en        102938 non-null  object
dtypes: int64(4), object(14)


In [28]:
def bottom_top(x,num=0):
    """Clean up each row of payroll data to unify the payroll 
    in thousands. Split into two columns and return the specified 
    column with num=0 for the lowest column and num=1 
    for the highest column."""
    if x == '面议' or x.find('天') != -1 or x.find('时') != -1 or x.find('次') != -1 or x.find('下') != -1:
        return np.nan
    elif x.find('/月') != -1:
        x = x.replace('/月', '')
    elif x.endswith('元/月'):
        x = float(x[:-3]) / 1000
    if len(x.split('-')) >1:
        x = x.split('-')[num]
        if x.find('千') != -1:
            x=float(x[0:x.find('千')])
        elif x.find('万') != -1:
            x=float(x[0:x.find('万')])*10
        elif x.find('') != -1:
            x=0
        return x

df=df.drop(index = df[df['salary'] == 0].index )
df['bottom'] = df.salary.apply(lambda x : bottom_top(x,0))
df['top'] = df.salary.apply(lambda x : bottom_top(x,1))
df['avg'] = (df['bottom']+df['top'])/2
df['avg_w'] =df['avg']/10

In [29]:
#Add a new column salary_label to classify the average salary into six categories: <5k,5k-1w,1w-1w5,1w5-2w,2w-5w,>5w.
df['salary_label'] = pd.cut(df.avg, bins=[0, 5, 10, 15,20, 50, max(df.avg)], labels=['<5000', '5000-10000', '10000-15000','15000-20000', '20000-50000', '>50000'])

print(df.salary_label.value_counts()) 

5000-10000     31984
10000-15000    31378
15000-20000    19276
20000-50000    17291
<5000           2421
>50000           540
Name: salary_label, dtype: int64


In [32]:
unique_company_types = df['companyType'].unique()
co_map={'民营':'Private', '上市公司':'Public Company', '股份制企业':'Joint-Stock enterprise', '国企':'State-Owned Enterprise', '合资':'Joint Venture', '外商独资':'Wholly Foreign-Owned', '其他':'Other', '事业单位':'Public Institution','银行':'Bank'}
df['Comp_en']=df['companyType'].map(co_map)


In [33]:
keywords = ['Java开发', 'UI设计师', 'Web前端', 'PHP', 'Python', 'Android', '美工', '深度学习', '算法工程师',
            'Hadoop', 'Node.js', '数据开发', '数据分析师', '数据架构', '人工智能', '区块链', '电气工程师',
            '电子工程师', 'PLC测试工程师', '设备工程师', '硬件工程师', '结构工程师', '工艺工程师', '产品经理',
            '新媒体运营', '运营专员', '淘宝运营', '天猫运营', '产品助理', '产品运营', '淘宝客服', '游戏运营',
            '编辑','开发','java','android','AI','数据','运营','产品',
            'ui','设计','客服','销售','平面','修图','制作','创意','前端','Web','软件','架构','RPA',
            '编程','Data','爬虫','算法','建模','系统','安卓','修图','美术','自然语言','NLP',
            '机器','工程','react','C++','硬件','分析师','软件',"C"]

df['keywords'] = df['name'].apply(lambda x: [keyword for keyword in keywords if keyword.lower() in x.lower()])


In [34]:
# Create classification functions for Internet jobs
def classify_industry(row):
    keywords = row['keywords']
    if any(keyword in keywords for keyword in ['Java开发', 'PHP', 'Python', 'Android', 'Node.js', 'Hadoop', 'react', 'C++', '编程', '开发', 'java', "C",'python', 'hadoop', 'node.js', 'android', 'react', 'C++', '编程', '系统', '安卓', '软件','AI','人工智能','区块链','深度学习','算法工程师','算法','RPA','Web','Web前端','前端','后端','编程','开发']):
        return 'Development'
    elif any(keyword in keywords for keyword in ['电气工程师', '电子工程师', 'PLC测试工程师', '设备工程师', '硬件工程师', '结构工程师', '工艺工程师', '机器', '工程', '硬件']):
        return 'Engineering'
    elif any(keyword in keywords for keyword in ['新媒体运营', '运营专员', '淘宝运营', '天猫运营', '淘宝客服', '游戏运营', '运营','客服','销售']):
        return 'Operations'
    elif any(keyword in keywords for keyword in ['产品经理', '产品助理', '产品运营','产品','分析师']):
        return 'Product'
    elif any(keyword in keywords for keyword in ['UI设计师', '美工','ui','设计','平面','修图','制作','创意','修图','美术']):
        return 'Design'
    elif any(keyword in keywords for keyword in ['数据开发', '数据分析师', '数据架构', '爬虫', '建模', '自然语言', '自然语言','NLP', 'Data', '数据']):
        return 'Data'
    else:
        return 'Other'

# Apply the classification function to create a new categorized field
df['industry_category'] = df.apply(classify_industry, axis=1)


In [37]:
# Calculate the percentage for each job category in Internet Industry
category_counts = df['industry_category'].value_counts()
category_percentages = round(category_counts / len(df) * 100,2)
print(category_percentages)

Operations     25.72
Engineering    24.91
Development    23.54
Other          11.96
Product         6.63
Design          6.12
Data            1.12
Name: industry_category, dtype: float64


In [35]:
print(df.info())
# Save the cleaned data frame as a new CSV file
df.to_csv('Internet_cleaned_file.csv', index=False)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 102938 entries, 0 to 154235
Data columns (total 26 columns):
 #   Column             Non-Null Count   Dtype   
---  ------             --------------   -----   
 0   name               102938 non-null  object  
 1   salary             102938 non-null  object  
 2   company            102938 non-null  object  
 3   address            102938 non-null  object  
 4   experience         102938 non-null  object  
 5   eduBack            102938 non-null  object  
 6   companyType        99373 non-null   object  
 7   scale              102938 non-null  object  
 8   info               102938 non-null  object  
 9   city_id            102938 non-null  int64   
 10  city_name          102938 non-null  object  
 11  eduBack_code       102938 non-null  int64   
 12  eduBack_en         102938 non-null  object  
 13  city_en            102938 non-null  object  
 14  scale_code         102938 non-null  int64   
 15  scale_en           102938 non-null

\begin{table}[h]
\centering
\begin{tabular}{|c|c|c|c|c|}
\hline
\textbf{job type} & \textbf{technical} & \textbf{design} & \textbf{product} & \textbf{operations} \\\
\hline
\textbf{Support} & 0.5052 & 0.0612 & 0.0590 & 0.1702 \\\
\hline
\end{tabular}
\caption{support for job classification}
\label{tab:support-job-categories}
\end{table}

keywords = ['Java开发', 'UI设计师', 'Web前端', 'PHP', 'Python', 'Android', '美工', '深度学习', '算法工程师',
            'Hadoop', 'Node.js', '数据开发', '数据分析师', '数据架构', '人工智能', '区块链', '电气工程师',
            '电子工程师', 'PLC测试工程师', '设备工程师', '硬件工程师', '结构工程师', '工艺工程师', '产品经理',
            '新媒体运营', '运营专员', '淘宝运营', '天猫运营', '产品助理', '产品运营', '淘宝客服', '游戏运营',
            '编辑','开发','java','python','hadoop','node.js','android','ai','AI','数据','运营','产品',
            'ui','设计','客服','销售','平面','修图','平面','制作','创意','前端','Web','软件','架构','RPA',
            '编程','Data','爬虫','算法','建模','系统','安卓','PPT','修图','美术','自然语言','NLP','nlp',
            '机器','工程','react','C++','硬件','分析师','软件']
