# Filtering Examples on the Stack Overflow Survey Data Set

In [46]:
import pandas as pd
import numpy as np
from IPython.display import display

In [47]:
# Survey responses, indexed by the "ResponseId" column.
df = pd.read_csv(
    "data/survey_results_public_2022.csv",
    index_col="ResponseId",
)
# Schema file, indexed by its "qname" column.
schema_df = pd.read_csv(
    "data/survey_results_schema.csv",
    index_col="qname",
)

In [48]:
# Never truncate cell text, and render at most 80 rows before eliding.
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_rows", 80)

In [49]:
## Filter high salary

# Boolean mask: True where the yearly compensation exceeds 60,000.
# (Underscores in numeric literals are only visual digit separators.)
highcomp_filter = df["ConvertedCompYearly"].gt(60_000)

# Keep only the masked rows and the handful of columns of interest.
high_df = df.loc[
    highcomp_filter,
    [
        "YearsCodePro",
        "LanguageHaveWorkedWith",
        "Country",
        "ConvertedCompYearly",
    ],
]
high_df.head(10)

Unnamed: 0_level_0,YearsCodePro,LanguageHaveWorkedWith,Country,ConvertedCompYearly
ResponseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4,17,C#;JavaScript;SQL;TypeScript,Israel,215232.0
11,2,Bash/Shell;C#;HTML/CSS;JavaScript;PowerShell;SQL,United Kingdom of Great Britain and Northern Ireland,60307.0
12,10,C#;HTML/CSS;JavaScript;PowerShell;Python;Rust;SQL,United States of America,194400.0
13,5,C;HTML/CSS;Rust;SQL;Swift;TypeScript,United States of America,65000.0
15,5,HTML/CSS;JavaScript;PHP;Python;R;Ruby;Scala,United States of America,110000.0
18,10,Python;SQL,Austria,202623.0
23,20,C#;SQL;TypeScript,Canada,97605.0
26,9,Dart;Go;Java;Kotlin;Swift;TypeScript,Germany,90647.0
27,5,Bash/Shell;Groovy;HTML/CSS;Java;JavaScript;SQL,United States of America,106960.0
29,14,C;C++,United States of America,130000.0


In [50]:
## Filter respondents of selected countries
countries = ["Germany", "United States of America", "Canada"]
country_filter = df["Country"].isin(countries)

# Selecting a single column label with .loc returns a Series. A Series has no
# HTML representation, so Jupyter renders it as plain text; to_frame() wraps it
# in a one-column DataFrame purely for nicer display.
# The Series gets its own name so country_filtd_df only ever holds a DataFrame.
country_series = df.loc[country_filter, "Country"]
display(country_series.head())
display(country_series.to_frame().head())


# Passing the column label inside a list makes .loc return a DataFrame directly.
country_filtd_df = df.loc[country_filter, ["Country"]]
country_filtd_df.head()


ResponseId
2                       Canada
5     United States of America
6                      Germany
12    United States of America
13    United States of America
Name: Country, dtype: object

Unnamed: 0_level_0,Country
ResponseId,Unnamed: 1_level_1
2,Canada
5,United States of America
6,Germany
12,United States of America
13,United States of America


Unnamed: 0_level_0,Country
ResponseId,Unnamed: 1_level_1
2,Canada
5,United States of America
6,Germany
12,United States of America
13,United States of America


In [51]:
## Filter respondents who know Python
# LanguageHaveWorkedWith holds a ";"-separated string, so this is a substring test.
# na=False makes missing answers evaluate to False, so those rows are excluded.
# regex=False matches the text literally; the default regex mode would choke on
# a language name such as "C++" if the search term were ever changed.
proglang_filter = df["LanguageHaveWorkedWith"].str.contains(
    "Python", na=False, regex=False
)

x = df.loc[proglang_filter, "LanguageHaveWorkedWith"].head()
x.to_frame()

Unnamed: 0_level_0,LanguageHaveWorkedWith
ResponseId,Unnamed: 1_level_1
3,C#;C++;HTML/CSS;JavaScript;Python
7,C++;HTML/CSS;JavaScript;PHP;Python;TypeScript
12,C#;HTML/CSS;JavaScript;PowerShell;Python;Rust;SQL
15,HTML/CSS;JavaScript;PHP;Python;R;Ruby;Scala
17,C#;Java;PHP;Python;R
