In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import re

In [3]:
data = pd.read_csv('alldata.csv')

In [4]:
df = pd.DataFrame(data)

In [5]:
df.head()

Unnamed: 0,position,company,description,reviews,location
0,Development Director,ALS TDI,Development Director\nALS Therapy Development ...,,"Atlanta, GA 30301"
1,An Ostentatiously-Excitable Principal Research...,The Hexagon Lavish,"Job Description\n\n""The road that leads to acc...",,"Atlanta, GA"
2,Data Scientist,Xpert Staffing,"Growing company located in the Atlanta, GA are...",,"Atlanta, GA"
3,Data Analyst,Operation HOPE,DEPARTMENT: Program OperationsPOSITION LOCATIO...,44.0,"Atlanta, GA 30303"
4,Assistant Professor -TT - Signal Processing & ...,Emory University,DESCRIPTION\nThe Emory University Department o...,550.0,"Atlanta, GA"


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6964 entries, 0 to 6963
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   position     6953 non-null   object 
 1   company      6953 non-null   object 
 2   description  6953 non-null   object 
 3   reviews      5326 non-null   float64
 4   location     6953 non-null   object 
dtypes: float64(1), object(4)
memory usage: 272.2+ KB


1 Location

In [7]:
df['location'].unique

<bound method Series.unique of 0        Atlanta, GA 30301 
1               Atlanta, GA
2               Atlanta, GA
3        Atlanta, GA 30303 
4               Atlanta, GA
               ...         
6959          Sunnyvale, CA
6960          Sunnyvale, CA
6961          Sunnyvale, CA
6962          Sunnyvale, CA
6963    Sunnyvale, CA 94089
Name: location, Length: 6964, dtype: object>

In [8]:
df['city,state']=df['location'].str.extract(r'([\D]+ ?[\D]+, \D\D)')

In [9]:
df['city,state']

0         Atlanta, GA
1         Atlanta, GA
2         Atlanta, GA
3         Atlanta, GA
4         Atlanta, GA
            ...      
6959    Sunnyvale, CA
6960    Sunnyvale, CA
6961    Sunnyvale, CA
6962    Sunnyvale, CA
6963    Sunnyvale, CA
Name: city,state, Length: 6964, dtype: object

In [10]:
df['state'] = df['city,state'].str[-2:]

In [11]:
df['state']

0       GA
1       GA
2       GA
3       GA
4       GA
        ..
6959    CA
6960    CA
6961    CA
6962    CA
6963    CA
Name: state, Length: 6964, dtype: object

In [12]:
df['city'] = df['city,state'].str[:-4]
df['city']

0         Atlanta
1         Atlanta
2         Atlanta
3         Atlanta
4         Atlanta
          ...    
6959    Sunnyvale
6960    Sunnyvale
6961    Sunnyvale
6962    Sunnyvale
6963    Sunnyvale
Name: city, Length: 6964, dtype: object

2 Position

In [13]:
df['position'].value_counts()

Data Scientist                                                             351
Senior Data Scientist                                                       96
Research Analyst                                                            64
Data Engineer                                                               60
Machine Learning Engineer                                                   56
                                                                          ... 
Research Analyst, Production Technology & Business Development Strategy      1
Engineering Scientist - R and D User Interface Software Developer            1
Software Engineer / Research Scientist - Question Answering                  1
Analytical Linguist, Natural Language Understanding (NLU) - Amazon AI        1
Research Associate/Senior Research Associate, Process Development            1
Name: position, Length: 5242, dtype: int64

In [14]:
df = df.apply(lambda x: x.astype(str).str.lower())

In [15]:
df['data scientist'] = df['position'].str.findall('data sci').str[0]
df['data scientist'].value_counts() 

data sci    1444
Name: data scientist, dtype: int64

In [16]:
df['data analyst'] = df['position'].str.findall('data analy').str[0]
df['data analyst'].value_counts() 

data analy    191
Name: data analyst, dtype: int64

In [17]:
df['machine learning'] = df['position'].str.findall('machine learning').str[0]
df['machine learning'].value_counts() 

machine learning    528
Name: machine learning, dtype: int64

In [18]:
df['research'] = df['position'].str.findall('research').str[0]

In [19]:
df['data engineer'] = df['position'].str.findall('data engineer').str[0]

In [20]:
df = df.drop(['location', 'city,state'], axis=1)

3 Description

In [21]:
df['work experience'] = df['description'].str.findall(r'\d+\s\byears\b|[\d]+\S?[\d]+\s\byears\b').str[0]
df['work experience'].value_counts()

5 years        406
2 years        299
3 years        271
3-5 years      146
4 years        141
              ... 
0-6 years        1
80 years         1
8-12 years       1
10-20 years      1
6-7 years        1
Name: work experience, Length: 103, dtype: int64

In [22]:
df['data scientist'] = df['data scientist'].fillna(False)
df['data analyst'] = df['data analyst'].fillna(False)
df['machine learning'] = df['machine learning'].fillna(False)
df['research'] = df['research'].fillna(False)
df['data engineer'] = df['data engineer'].fillna(False)
df['reviews'] = df['reviews'].fillna(0)

In [23]:
df['data scientist'] = df['data scientist'].replace({'data sci': True})
df['data analyst'] = df['data analyst'].replace({'data analy': True})
df['machine learning'] = df['machine learning'].replace({'machine learning': True})
df['research'] = df['research'].replace({'research': True})
df['data engineer'] = df['data engineer'].replace({'data engineer': True})


In [24]:
df.head()

Unnamed: 0,position,company,description,reviews,state,city,data scientist,data analyst,machine learning,research,data engineer,work experience
0,development director,als tdi,development director\nals therapy development ...,,ga,atlanta,False,False,False,False,False,6-8 years
1,an ostentatiously-excitable principal research...,the hexagon lavish,"job description\n\n""the road that leads to acc...",,ga,atlanta,False,False,False,True,False,
2,data scientist,xpert staffing,"growing company located in the atlanta, ga are...",,ga,atlanta,True,False,False,False,False,
3,data analyst,operation hope,department: program operationsposition locatio...,44.0,ga,atlanta,False,True,False,False,False,25 years
4,assistant professor -tt - signal processing & ...,emory university,description\nthe emory university department o...,550.0,ga,atlanta,False,False,True,False,False,


4 Tech Centers

In [25]:
tech_centers = ['Austin', 'Atlanta', 'San Fransisco', 'New York City', 'San Diego', 'Boston', 'Portland', 'Seattle']

In [27]:
df = df.drop(['description'], axis=1)

In [30]:
df.to_csv('wranglin crew clean data 2.csv', index=False)

In [31]:
desc = df.describe()
desc.to_csv('description.csv', index=False)