In [25]:
import pandas as pd
import numpy as np

In [26]:
people = {
    'first': ['Corey', 'Jane', 'John', 'Chris', np.nan, None, 'NA'], 
    'last': ['Schafer', 'Doe', 'Doe', 'Schafer', np.nan, None, 'Missing'], 
    'email': ['CoreyMSchafer@gmail.com', 'JaneDoe@email.com', 'JohnDoe@email.com', None, np.nan, 'Anonymous@email.com', 'NA'],
    'age': ['33', '55', '63', '36', None, None, 'Missing']
}

In [36]:
df = pd.DataFrame(people)

df.replace('NA', np.nan, inplace = True)
df.replace('Missing', np.nan, inplace = True)

In [37]:
df.head(3)

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63


In [38]:
# Drop N/A Values 

df.dropna()

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63


In [39]:
# Conditional Drop If ANY Values Missing

df.dropna(axis = 'index', how = 'any')

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63


In [40]:
# Conditional Drop If ALL Values Missing

df.dropna(axis = 'index', how = 'all')

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
5,,,Anonymous@email.com,


In [41]:
df.dropna(axis = 'index', how = 'any', subset = ['email'])

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
5,,,Anonymous@email.com,


In [42]:
df.dropna(axis = 'index', how = 'all', subset = ['last', 'email'])

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
5,,,Anonymous@email.com,


In [46]:
df.head(15)

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
4,,,,
5,,,Anonymous@email.com,
6,,,,


In [44]:
# Boolean Check of Missing Values 

df.isna()

Unnamed: 0,first,last,email,age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
4,True,True,True,True
5,True,True,False,True
6,True,True,True,True


In [47]:
# Mask Missing Values 

df.fillna('0')

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,0,36
4,0,0,0,0
5,0,0,Anonymous@email.com,0
6,0,0,0,0


In [51]:
# Default np.nan Type

type(np.nan)

float

In [52]:
# Convert Int to Str

df['age'] = df['age'].astype(float)

In [54]:
df.dtypes

first     object
last      object
email     object
age      float64
dtype: object

In [56]:
df['age'].mean()

46.75

# Switch to CSV

In [57]:
import pandas as pd

In [59]:
# Creating Default Missing Values 

na_vals = ['NA', 'Missing']

df = pd.read_csv(r'C:\Users\zacha\Hyperborea\csv files\survey_results_public.csv', na_values = na_vals)

In [60]:
df.head(5)

Unnamed: 0,ResponseId,MainBranch,Employment,Country,US_State,UK_Country,EdLevel,Age1stCode,LearnCode,YearsCode,...,Age,Gender,Trans,Sexuality,Ethnicity,Accessibility,MentalHealth,SurveyLength,SurveyEase,ConvertedCompYearly
0,1,I am a developer by profession,"Independent contractor, freelancer, or self-em...",Slovakia,,,"Secondary school (e.g. American high school, G...",18 - 24 years,Coding Bootcamp;Other online resources (ex: vi...,,...,25-34 years old,Man,No,Straight / Heterosexual,White or of European descent,None of the above,None of the above,Appropriate in length,Easy,62268.0
1,2,I am a student who is learning to code,"Student, full-time",Netherlands,,,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",11 - 17 years,"Other online resources (ex: videos, blogs, etc...",7.0,...,18-24 years old,Man,No,Straight / Heterosexual,White or of European descent,None of the above,None of the above,Appropriate in length,Easy,
2,3,"I am not primarily a developer, but I write co...","Student, full-time",Russian Federation,,,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",11 - 17 years,"Other online resources (ex: videos, blogs, etc...",,...,18-24 years old,Man,No,Prefer not to say,Prefer not to say,None of the above,None of the above,Appropriate in length,Easy,
3,4,I am a developer by profession,Employed full-time,Austria,,,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",11 - 17 years,,,...,35-44 years old,Man,No,Straight / Heterosexual,White or of European descent,I am deaf / hard of hearing,,Appropriate in length,Neither easy nor difficult,
4,5,I am a developer by profession,"Independent contractor, freelancer, or self-em...",United Kingdom of Great Britain and Northern I...,,England,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",5 - 10 years,Friend or family member,17.0,...,25-34 years old,Man,No,,White or of European descent,None of the above,,Appropriate in length,Easy,


In [61]:
df['YearsCode'].head(10)

0    NaN
1      7
2    NaN
3    NaN
4     17
5    NaN
6      3
7      4
8      6
9      7
Name: YearsCode, dtype: object

In [62]:
df['YearsCode'].mean()

TypeError: unsupported operand type(s) for +: 'int' and 'str'

In [64]:
# Convert Str to Float

df['YearsCode'] = df['YearsCode'].astype(float)

ValueError: could not convert string to float: 'Less than 1 year'

In [65]:
# Display All Unique Values 

df['YearsCode'].unique()

array([nan, '7', '17', '3', '4', '6', '16', '12', '15', '10', '40', '9',
       '26', '14', '39', '20', '8', '19', '5', 'Less than 1 year', '22',
       '2', '1', '34', '21', '13', '25', '24', '30', '31', '18', '38',
       'More than 50 years', '27', '41', '42', '35', '23', '28', '11',
       '37', '44', '43', '36', '33', '45', '29', '50', '46', '32', '47',
       '49', '48'], dtype=object)

In [69]:
df['YearsCode'].replace('Less than 1 year', 0, inplace = True)

In [70]:
df['YearsCode'].unique()

array([nan, '7', '17', '3', '4', '6', '16', '12', '15', '10', '40', '9',
       '26', '14', '39', '20', '8', '19', '5', 0, '22', '2', '1', '34',
       '21', '13', '25', '24', '30', '31', '18', '38',
       'More than 50 years', '27', '41', '42', '35', '23', '28', '11',
       '37', '44', '43', '36', '33', '45', '29', '50', '46', '32', '47',
       '49', '48'], dtype=object)

In [71]:
df['YearsCode'].replace('More than 50 years', 0, inplace = True)

In [72]:
df['YearsCode'].unique()

array([nan, '7', '17', '3', '4', '6', '16', '12', '15', '10', '40', '9',
       '26', '14', '39', '20', '8', '19', '5', 0, '22', '2', '1', '34',
       '21', '13', '25', '24', '30', '31', '18', '38', '27', '41', '42',
       '35', '23', '28', '11', '37', '44', '43', '36', '33', '45', '29',
       '50', '46', '32', '47', '49', '48'], dtype=object)

In [73]:
df['YearsCode'] = df['YearsCode'].astype(float)

In [74]:
df['YearsCode'].unique()

array([nan,  7., 17.,  3.,  4.,  6., 16., 12., 15., 10., 40.,  9., 26.,
       14., 39., 20.,  8., 19.,  5.,  0., 22.,  2.,  1., 34., 21., 13.,
       25., 24., 30., 31., 18., 38., 27., 41., 42., 35., 23., 28., 11.,
       37., 44., 43., 36., 33., 45., 29., 50., 46., 32., 47., 49., 48.])

In [75]:
df['YearsCode'].mean()

12.232003527639298

In [76]:
df['YearsCode'].median()

10.0