In [43]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

from xgboost import XGBRegressor

In [44]:
def disp_unique_col_val(columns):
    for col in columns:
        uniques = df[col].unique()
        print(col)
        print('=' * 30)
        for unique in uniques:
            print(unique)
        if (col != df.columns[-1]): print('\n')

## Model Import

In [45]:
df = pd.read_csv('assets/data_analyst.csv')

## Model Inspection

In [46]:
df.head()

Unnamed: 0.1,Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors,Easy Apply
0,0,"Data Analyst, Center on Immigration and Justic...",$37K-$66K (Glassdoor est.),Are you eager to roll up your sleeves and harn...,3.2,Vera Institute of Justice\n3.2,"New York, NY","New York, NY",201 to 500 employees,1961,Nonprofit Organization,Social Assistance,Non-Profit,$100 to $500 million (USD),-1,True
1,1,Quality Data Analyst,$37K-$66K (Glassdoor est.),Overview\n\nProvides analytical and technical ...,3.8,Visiting Nurse Service of New York\n3.8,"New York, NY","New York, NY",10000+ employees,1893,Nonprofit Organization,Health Care Services & Hospitals,Health Care,$2 to $5 billion (USD),-1,-1
2,2,"Senior Data Analyst, Insights & Analytics Team...",$37K-$66K (Glassdoor est.),We’re looking for a Senior Data Analyst who ha...,3.4,Squarespace\n3.4,"New York, NY","New York, NY",1001 to 5000 employees,2003,Company - Private,Internet,Information Technology,Unknown / Non-Applicable,GoDaddy,-1
3,3,Data Analyst,$37K-$66K (Glassdoor est.),Requisition NumberRR-0001939\nRemote:Yes\nWe c...,4.1,Celerity\n4.1,"New York, NY","McLean, VA",201 to 500 employees,2002,Subsidiary or Business Segment,IT Services,Information Technology,$50 to $100 million (USD),-1,-1
4,4,Reporting Data Analyst,$37K-$66K (Glassdoor est.),ABOUT FANDUEL GROUP\n\nFanDuel Group is a worl...,3.9,FanDuel\n3.9,"New York, NY","New York, NY",501 to 1000 employees,2009,Company - Private,Sports & Recreation,"Arts, Entertainment & Recreation",$100 to $500 million (USD),DraftKings,True


In [47]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2253 entries, 0 to 2252
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         2253 non-null   int64  
 1   Job Title          2253 non-null   object 
 2   Salary Estimate    2253 non-null   object 
 3   Job Description    2253 non-null   object 
 4   Rating             2253 non-null   float64
 5   Company Name       2252 non-null   object 
 6   Location           2253 non-null   object 
 7   Headquarters       2253 non-null   object 
 8   Size               2253 non-null   object 
 9   Founded            2253 non-null   int64  
 10  Type of ownership  2253 non-null   object 
 11  Industry           2253 non-null   object 
 12  Sector             2253 non-null   object 
 13  Revenue            2253 non-null   object 
 14  Competitors        2253 non-null   object 
 15  Easy Apply         2253 non-null   object 
dtypes: float64(1), int64(2),

## Model Cleaning

### Drop Unused Columns

In [48]:
col_drop_list = [df.columns[0], df.columns[3], df.columns[5], df.columns[14], df.columns[15]]
df = df.drop(columns=col_drop_list)
print(col_drop_list)
disp_unique_col_val(df.columns[1:])

['Unnamed: 0', 'Job Description', 'Company Name', 'Competitors', 'Easy Apply']
Salary Estimate
$37K-$66K (Glassdoor est.)
$46K-$87K (Glassdoor est.)
$51K-$88K (Glassdoor est.)
$51K-$87K (Glassdoor est.)
$59K-$85K (Glassdoor est.)
$43K-$76K (Glassdoor est.)
$60K-$110K (Glassdoor est.)
$41K-$78K (Glassdoor est.)
$45K-$88K (Glassdoor est.)
$73K-$127K (Glassdoor est.)
$84K-$90K (Glassdoor est.)
$27K-$52K (Glassdoor est.)
$42K-$74K (Glassdoor est.)
$77K-$132K (Glassdoor est.)
$98K-$114K (Glassdoor est.)
$48K-$96K (Glassdoor est.)
$26K-$47K (Glassdoor est.)
$31K-$59K (Glassdoor est.)
$47K-$81K (Glassdoor est.)
$43K-$69K (Glassdoor est.)
$49K-$112K (Glassdoor est.)
$30K-$54K (Glassdoor est.)
$55K-$103K (Glassdoor est.)
$37K-$70K (Glassdoor est.)
$57K-$103K (Glassdoor est.)
$35K-$45K (Glassdoor est.)
$42K-$66K (Glassdoor est.)
$65K-$81K (Glassdoor est.)
$113K-$132K (Glassdoor est.)
$42K-$63K (Glassdoor est.)
$60K-$66K (Glassdoor est.)
$73K-$82K (Glassdoor est.)
$67K-$92K (Glassdoor est.)
$42K-

### Replace Values to NaN

In [49]:
df = df.replace(['-1', '-1.0', -1, 'Unknown', 'Unknown / Non-Applicable'], np.nan)
disp_unique_col_val(df.columns)

Job Title
Data Analyst, Center on Immigration and Justice (CIJ)
Quality Data Analyst
Senior Data Analyst, Insights & Analytics Team [Customer Operations]
Data Analyst
Reporting Data Analyst
Business/Data Analyst (FP&A)
Data Science Analyst
Data Analyst, Merchant Health
DATA ANALYST
Senior Data Analyst
Investment Advisory Data Analyst
Sustainability Data Analyst
Clinical Data Analyst
DATA PROGRAMMER/ANALYST
Product Analyst, Data Science
Data Analyst - Intex Developer
Entry Level / Jr. Data Analyst
Data + Business Intelligence Analyst
Data Analyst, Product
Data Analyst Entry Level
Data Science Analyst, Capital Markets
Data Analyst (Games)
Analyst/Associate Global Markets Credit Data Analyst
Data Business Analyst
Data Analyst with Excel/DAX/ PowerBI experience- Fulltime
Data Science Analyst/Engineer
Business Analyst, Data Platforms
Behavioral Data Analyst
Data Analyst -1+ year Contract - NYC
Advertising Data Analyst
Market Data Reporting Analyst
Senior Analyst, Data Science
Senior Data An

Hilliard, OH
Dublin, OH
Charlotte, NC
Mooresville, NC
Huntersville, NC
Fort Mill, SC
Indian Trail, NC
San Francisco, CA
San Rafael, CA
San Mateo, CA
Oakland, CA
Berkeley, CA
Burlingame, CA
Foster City, CA
Marin City, CA
Daly City, CA
South San Francisco, CA
San Bruno, CA
Emeryville, CA
Novato, CA
Millbrae, CA
San Ramon, CA
Hercules, CA
Alameda, CA
Walnut Creek, CA
Carmel, IN
Indianapolis, IN
Whitestown, IN
Jeffersonville, IN
Beech Grove, IN
Lawrence, IN
Redmond, WA
Seattle, WA
Issaquah, WA
Kent, WA
Bellevue, WA
Renton, WA
Kirkland, WA
Athens, GA
Burlingame, KS
Topeka, KS
Denver, CO
Centennial, CO
Boulder, CO
Greenwood Village, Arapahoe, CO
Henderson, CO
Englewood, CO
Lakewood, CO
Lone Tree, CO
Louisville, CO
Aurora, CO
Broomfield, CO
Littleton, CO


Headquarters
New York, NY
McLean, VA
Stamford, CT
London, United Kingdom
nan
Bronx, NY
Phoenix, AZ
Rome, NY
Waltham, MA
Mountain View, CA
Marina del Rey, CA
Sydney, Australia
Warren, MI
Columbus, OH
San Francisco, CA
Pittsburgh, PA
Washingt

### Drop NaN

In [50]:
df = df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1205 entries, 0 to 2252
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Job Title          1205 non-null   object 
 1   Salary Estimate    1205 non-null   object 
 2   Rating             1205 non-null   float64
 3   Location           1205 non-null   object 
 4   Headquarters       1205 non-null   object 
 5   Size               1205 non-null   object 
 6   Founded            1205 non-null   float64
 7   Type of ownership  1205 non-null   object 
 8   Industry           1205 non-null   object 
 9   Sector             1205 non-null   object 
 10  Revenue            1205 non-null   object 
dtypes: float64(2), object(9)
memory usage: 113.0+ KB


### Preparation

In [85]:
x = pd.DataFrame()
y = pd.DataFrame()

#### Y

In [86]:
# Salary
salaries = df[df.columns[1]].copy()
salary_l = []
salary_u = []
for salary in salaries:
    temp = salary.replace(' (Glassdoor est.)', '').replace('$', '').replace('K', '').split('-')
    salary_l.append(int(temp[0]))
    salary_u.append(int(temp[1]))
y['Salary Lower'] = salary_l
y['Salary Upper'] = salary_u
y

Unnamed: 0,Salary Lower,Salary Upper
0,37,66
1,37,66
2,37,66
3,37,66
4,37,66
...,...,...
1200,78,104
1201,78,104
1202,78,104
1203,78,104


#### X

In [100]:
# Size
# sizes = df[df.columns[5]].unique()
# replacement = {}
# for size in sizes:
#     temp = size.replace('to', '~').replace('employees', '')
#     if ('+' in temp):
#         temp = temp.replace('+', "~")
#     replacement[size] = temp
# df[df.columns[5]].replace(replacement, inplace=True)

sizes = df[df.columns[5]].copy()
size_l = []
size_u = []
for size in sizes:
    temp = size.replace(' employees', '').replace('+', '').split(' to ')
    temp
    print(temp)
#     size_l.append(int(temp[0]))
#     size_u.append(int(temp[1]))
size_l
# x['Employee Lower'] = size_l
# x['Employee Upper'] = size_u
# x

['201', '500']
['10000']
['201', '500']
['501', '1000']
['201', '500']
['5001', '10000']
['10000']
['10000']
['201', '500']
['1001', '5000']
['51', '200']
['501', '1000']
['10000']
['10000']
['201', '500']
['51', '200']
['10000']
['10000']
['1001', '5000']
['501', '1000']
['1001', '5000']
['10000']
['1001', '5000']
['201', '500']
['10000']
['1001', '5000']
['51', '200']
['1001', '5000']
['1001', '5000']
['1001', '5000']
['10000']
['1001', '5000']
['1001', '5000']
['501', '1000']
['51', '200']
['10000']
['10000']
['501', '1000']
['51', '200']
['1001', '5000']
['1001', '5000']
['1001', '5000']
['201', '500']
['1001', '5000']
['201', '500']
['1001', '5000']
['10000']
['501', '1000']
['501', '1000']
['10000']
['51', '200']
['10000']
['1001', '5000']
['501', '1000']
['10000']
['1', '50']
['51', '200']
['10000']
['10000']
['201', '500']
['10000']
['201', '500']
['501', '1000']
['10000']
['1001', '5000']
['1001', '5000']
['10000']
['201', '500']
['10000']
['10000']
['10000']
['10000']
['51', 

[]

In [11]:
# Revenue
revenues = df[df.columns[-1]].unique()
replacement = {}
for revenue in revenues:
    temp = revenue.replace('to', '~').replace(' (USD)', '').replace('$', '$\$$')
    if ('Less than' in temp):
        temp = temp.replace('Less than ', '~')
    if ('million' in temp):
        temp = temp.replace('million', 'm')
    if ('billion' in temp):
        temp = temp.replace('billion', 'b')
    if ('+' in temp):
        temp = temp.replace('+', '') + '~'
    replacement[revenue] = temp
df[df.columns[-1]].replace(replacement, inplace=True)

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1205 entries, 0 to 2252
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Job Title          1205 non-null   object 
 1   Salary Estimate    1205 non-null   object 
 2   Rating             1205 non-null   float64
 3   Location           1205 non-null   object 
 4   Headquarters       1205 non-null   object 
 5   Employee           1205 non-null   object 
 6   Founded            1205 non-null   float64
 7   Type of ownership  1205 non-null   object 
 8   Industry           1205 non-null   object 
 9   Sector             1205 non-null   object 
 10  Revenue            1205 non-null   object 
dtypes: float64(2), object(9)
memory usage: 113.0+ KB


In [21]:
df.head()

Unnamed: 0,Job Title,Salary Estimate,Rating,Location,Headquarters,Employee,Founded,Type of ownership,Industry,Sector,Revenue
0,"Data Analyst, Center on Immigration and Justic...",$\$$37K - $\$$66K,3.2,"New York, NY","New York, NY",201 ~ 500,1961,Nonprofit Organization,Social Assistance,Non-Profit,$\$$100 ~ $\$$500 m
1,Quality Data Analyst,$\$$37K - $\$$66K,3.8,"New York, NY","New York, NY",10000~,1893,Nonprofit Organization,Health Care Services & Hospitals,Health Care,$\$$2 ~ $\$$5 b
3,Data Analyst,$\$$37K - $\$$66K,4.1,"New York, NY","McLean, VA",201 ~ 500,2002,Subsidiary or Business Segment,IT Services,Information Technology,$\$$50 ~ $\$$100 m
4,Reporting Data Analyst,$\$$37K - $\$$66K,3.9,"New York, NY","New York, NY",501 ~ 1000,2009,Company - Private,Sports & Recreation,"Arts, Entertainment & Recreation",$\$$100 ~ $\$$500 m
7,Data Science Analyst,$\$$37K - $\$$66K,3.7,"New York, NY","New York, NY",201 ~ 500,1914,Company - Private,Insurance Carriers,Insurance,$\$$100 ~ $\$$500 m


### Rename Column

In [13]:
df.rename(columns = {df.columns[5]: 'Employee'}, inplace=True)
df

### Type

In [19]:
df[df.columns[6]] = df[df.columns[6]].astype(int)

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1205 entries, 0 to 2252
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Job Title          1205 non-null   object 
 1   Salary Estimate    1205 non-null   object 
 2   Rating             1205 non-null   float64
 3   Location           1205 non-null   object 
 4   Headquarters       1205 non-null   object 
 5   Employee           1205 non-null   object 
 6   Founded            1205 non-null   int64  
 7   Type of ownership  1205 non-null   object 
 8   Industry           1205 non-null   object 
 9   Sector             1205 non-null   object 
 10  Revenue            1205 non-null   object 
dtypes: float64(1), int64(1), object(9)
memory usage: 113.0+ KB


## Prediction

In [28]:
x = df[[df.columns[2]]]
y = df[df.columns[1]]

In [30]:
lin_reg = LinearRegression()
lin_reg.fit(x, y)

ValueError: could not convert string to float: '$\\$$37K - $\\$$66K '