In [1]:
import pandas as pd

# Load the survey responses; the relative path is resolved against the kernel's working directory.
df = pd.read_csv('survey.csv')
df.shape  # (rows, columns) tuple

(64461, 61)

In [2]:
# .info() lists every column with its non-null count and dtype
# (text columns are reported as `object`, numbers as int64 / float64).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64461 entries, 0 to 64460
Data columns (total 61 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Respondent                    64461 non-null  int64  
 1   MainBranch                    64162 non-null  object 
 2   Hobbyist                      64416 non-null  object 
 3   Age                           45446 non-null  float64
 4   Age1stCode                    57900 non-null  object 
 5   CompFreq                      40069 non-null  object 
 6   CompTotal                     34826 non-null  float64
 7   ConvertedComp                 34756 non-null  float64
 8   Country                       64072 non-null  object 
 9   CurrencyDesc                  45472 non-null  object 
 10  CurrencySymbol                45472 non-null  object 
 11  DatabaseDesireNextYear        44070 non-null  object 
 12  DatabaseWorkedWith            49537 non-null  object 
 13  D

In [3]:
# Once we know how many rows x cols we've got,
# we can tweak the display options, e.g. to show every column:
pd.set_option('display.max_columns', 61)  # 61 = the column count reported by df.shape above
df.head(1)  # first row, now rendered with all columns visible

Unnamed: 0,Respondent,MainBranch,Hobbyist,Age,Age1stCode,CompFreq,CompTotal,ConvertedComp,Country,CurrencyDesc,CurrencySymbol,DatabaseDesireNextYear,DatabaseWorkedWith,DevType,EdLevel,Employment,Ethnicity,Gender,JobFactors,JobSat,JobSeek,LanguageDesireNextYear,LanguageWorkedWith,MiscTechDesireNextYear,MiscTechWorkedWith,NEWCollabToolsDesireNextYear,NEWCollabToolsWorkedWith,NEWDevOps,NEWDevOpsImpt,NEWEdImpt,NEWJobHunt,NEWJobHuntResearch,NEWLearn,NEWOffTopic,NEWOnboardGood,NEWOtherComms,NEWOvertime,NEWPurchaseResearch,NEWPurpleLink,NEWSOSites,NEWStuck,OpSys,OrgSize,PlatformDesireNextYear,PlatformWorkedWith,PurchaseWhat,Sexuality,SOAccount,SOComm,SOPartFreq,SOVisitFreq,SurveyEase,SurveyLength,Trans,UndergradMajor,WebframeDesireNextYear,WebframeWorkedWith,WelcomeChange,WorkWeekHrs,YearsCode,YearsCodePro
0,1,I am a developer by profession,Yes,,13,Monthly,,,Germany,European Euro,EUR,Microsoft SQL Server,Elasticsearch;Microsoft SQL Server;Oracle,"Developer, desktop or enterprise applications;...","Master’s degree (M.A., M.S., M.Eng., MBA, etc.)","Independent contractor, freelancer, or self-em...",White or of European descent,Man,"Languages, frameworks, and other technologies ...",Slightly satisfied,I am not interested in new job opportunities,C#;HTML/CSS;JavaScript,C#;HTML/CSS;JavaScript,.NET Core;Xamarin,.NET;.NET Core,Microsoft Teams;Microsoft Azure;Trello,Confluence;Jira;Slack;Microsoft Azure;Trello,No,Somewhat important,Fairly important,,,Once a year,Not sure,,No,Often: 1-2 days per week or more,Start a free trial;Ask developers I know/work ...,Amused,Stack Overflow (public Q&A for anyone who codes),Visit Stack Overflow;Go for a walk or other ph...,Windows,2 to 9 employees,Android;iOS;Kubernetes;Microsoft Azure;Windows,Windows,,Straight / Heterosexual,No,"No, not at all",,Multiple times per day,Neither easy nor difficult,Appropriate in length,No,"Computer science, computer engineering, or sof...",ASP.NET Core,ASP.NET;ASP.NET Core,Just as welcome now as I felt last year,50.0,36,27


In [4]:
# A DataFrame can also be built from plain Python structures.
# Here we use a dict: every key becomes a column label and every
# list holds the values of that column.
people = dict(
    name=['Power', 'Rangers', 'Ritta'],
    last_name=['Zordon', 'Hulk', 'SikiTa'],
    age=[22, 11, 33],
)
# Hand the dict to the DataFrame constructor:
new_df = pd.DataFrame(data=people)
new_df

Unnamed: 0,name,last_name,age
0,Power,Zordon,22
1,Rangers,Hulk,11
2,Ritta,SikiTa,33


In [5]:
# pandas has two core containers: a single column is a Series, the whole table is a DataFrame.
print(type(new_df['name']), type(new_df))

<class 'pandas.core.series.Series'> <class 'pandas.core.frame.DataFrame'>


In [6]:
# Series: a 1-D labelled array - like a Python list, but with much more functionality.
# DataFrame: a 2-D table of rows and columns; think of it as a container for
# multiple Series objects (one per column).

In [7]:
# Passing a list of column names selects several columns at once; the result is a DataFrame.
new_df[['name', 'age']]

Unnamed: 0,name,age
0,Power,22
1,Rangers,11
2,Ritta,33


In [8]:
# LIST THE COLUMNS:
# .columns returns the Index of column labels (it does not select a column's data);
# use it to look up the names you can then pass to new_df[...] or .loc:
new_df.columns

Index(['name', 'last_name', 'age'], dtype='object')

In [9]:
# GET A ROW:
# use .loc[] or .iloc[] to grab a row
# iloc[] - accesses rows by INTEGER position, hence the name: integer location.
# Say we want the 1st row using iloc, we use:
new_df.iloc[0]
# NOTE: a notebook cell only renders its last expression, so the line above is
# evaluated but not displayed here.
# We can also access a couple of rows by passing a list of positions:
new_df.iloc[[0, 1]]

Unnamed: 0,name,last_name,age
0,Power,Zordon,22
1,Rangers,Hulk,11


### What is more, we can also access data by providing **<u>ROWS-COLS</u>** with iloc and loc:

In [10]:
# iloc also accepts a second argument for the columns: .iloc[rows, cols].
# Because this is iloc, both must be INTEGER positions.
new_df.iloc[[0, 1], [1, 2]]
# One column for several rows works too: new_df.iloc[[0, 1], [2]]
# (with .loc we would pass the column's name instead).

Unnamed: 0,last_name,age
0,Zordon,22
1,Hulk,11


In [11]:
# loc - selects rows (and, as an optional 2nd argument, columns) by LABEL, not by integer position.
# Don't get confused: the "numbers" on the very left of new_df right now
# are in fact labels (the default index); we will cover this further on.

In [12]:
# NOTICE: with .loc the columns are addressed by their labels.
new_df.loc[[0, 1], ['name', 'age']]
# A single column works as well: new_df.loc[[0, 1], ['name']]

Unnamed: 0,name,age
0,Power,22
1,Rangers,11


In [108]:
# Load the companion schema file; as the preview shows, it has two columns:
# Column (a survey column name) and QuestionText (the question behind it).
schema = pd.read_csv('schema.csv')
schema.head(1)

Unnamed: 0,Column,QuestionText
0,Respondent,Randomized respondent ID number (not in order ...


In [14]:
# A quick taste: value_counts() tallies how often each value (Yes / No) occurs in this column.
# Missing answers are not counted - the two totals add up to the column's non-null count.
df['Hobbyist'].value_counts()

Yes    50388
No     14028
Name: Hobbyist, dtype: int64

In [15]:
# List selectors for both rows and columns keep the result a DataFrame (here 1 row x 1 column).
df.loc[[0], ['Hobbyist']]

Unnamed: 0,Hobbyist
0,Yes


In [16]:
# Every answer given by the first respondent (index label 0), returned as a Series.
df.loc[0]

Respondent                                                  1
MainBranch                     I am a developer by profession
Hobbyist                                                  Yes
Age                                                       NaN
Age1stCode                                                 13
                                       ...                   
WebframeWorkedWith                       ASP.NET;ASP.NET Core
WelcomeChange         Just as welcome now as I felt last year
WorkWeekHrs                                              50.0
YearsCode                                                  36
YearsCodePro                                               27
Name: 0, Length: 61, dtype: object

In [18]:
# IMPORTANT: when passing a slice, e.g. df.loc[0:2, ['COL_NAME']], do not wrap the slice in brackets.
# Unlike regular Python slices, .loc slices are INCLUSIVE of the end label,
# so 0:2 returns 3 rows (labels 0, 1 and 2):
df.loc[0:2, ['Hobbyist']]

Unnamed: 0,Hobbyist
0,Yes
1,No
2,Yes


In [19]:
# The same INCLUSIVE slices work on column labels too
# (you can always check the column names with df.columns):
df.loc[0:2, 'Hobbyist':'Employment']

Unnamed: 0,Hobbyist,Age,Age1stCode,CompFreq,CompTotal,ConvertedComp,Country,CurrencyDesc,CurrencySymbol,DatabaseDesireNextYear,DatabaseWorkedWith,DevType,EdLevel,Employment
0,Yes,,13,Monthly,,,Germany,European Euro,EUR,Microsoft SQL Server,Elasticsearch;Microsoft SQL Server;Oracle,"Developer, desktop or enterprise applications;...","Master’s degree (M.A., M.S., M.Eng., MBA, etc.)","Independent contractor, freelancer, or self-em..."
1,No,,19,,,,United Kingdom,Pound sterling,GBP,,,"Developer, full-stack;Developer, mobile","Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Employed full-time
2,Yes,,15,,,,Russian Federation,,,,,,,


---
## Let's talk deeper about INDEXES (set, reset, use indexes):

In [26]:
# A second toy data set: this time every person carries a unique e-mail address.
people2 = dict(
    name=['Power', 'Rangers', 'Ritta'],
    email=['Zordon@gf.gg', 'Hulk@wp.gg', 'SikiTa@gg.gg'],
    age=[22, 11, 33],
)
new_df2 = pd.DataFrame(data=people2)
new_df2  # built from the people2 dict defined just above

Unnamed: 0,name,email,age
0,Power,Zordon@gf.gg,22
1,Rangers,Hulk@wp.gg,11
2,Ritta,SikiTa@gg.gg,33


In [24]:
# We want index labels that are unique per row - the email addresses are a good candidate:
new_df2['email']

0    Zordon@gf.gg
1      Hulk@wp.gg
2    SikiTa@gg.gg
Name: email, dtype: object

In [28]:
# NOTICE: set_index returns a new DataFrame and does NOT change the original one -
# the next cell shows new_df2 still in the state it was created in.
new_df2.set_index('email')

Unnamed: 0_level_0,name,age
email,Unnamed: 1_level_1,Unnamed: 2_level_1
Zordon@gf.gg,Power,22
Hulk@wp.gg,Rangers,11
SikiTa@gg.gg,Ritta,33


In [30]:
new_df2  # see: the index of the original is unchanged, which is awesome
# But what if you do want the change to be permanent? Just add the argument inplace=True, like this:
# new_df2.set_index('email', inplace=True)  # this modifies new_df2 itself; let's see it in action below

Unnamed: 0,name,email,age
0,Power,Zordon@gf.gg,22
1,Rangers,Hulk@wp.gg,11
2,Ritta,SikiTa@gg.gg,33


In [32]:
new_df2.index  # proof: the index is still the default RangeIndex

RangeIndex(start=0, stop=3, step=1)

In [37]:
# And now after the change:
new_df2.set_index('email', inplace=True)  # inplace=True modifies new_df2 itself (PERMANENT CHANGE)
# NOTE: this cell is not re-runnable - once 'email' has become the index it is no
# longer a column, so running the line above a second time raises KeyError.
new_df2.index  # proof: the index now holds the email labels

Index(['Zordon@gf.gg', 'Hulk@wp.gg', 'SikiTa@gg.gg'], dtype='object', name='email')

In [46]:
# Since the index labels are strings now, it's a good moment to show .iloc and .loc in action again:
new_df2.iloc[0]  # .iloc[] - still selects by integer position

name    Power
age        22
Name: Zordon@gf.gg, dtype: object

In [49]:
new_df2.loc['Zordon@gf.gg']  # .loc[] - selects by index label, which is now the email address

name    Power
age        22
Name: Zordon@gf.gg, dtype: object

In [50]:
new_df2  # the whole frame, now indexed by email

Unnamed: 0_level_0,name,age
email,Unnamed: 1_level_1,Unnamed: 2_level_1
Zordon@gf.gg,Power,22
Hulk@wp.gg,Rangers,11
SikiTa@gg.gg,Ritta,33


In [55]:
# Back to our df DataFrame: time for some clean-up, starting with the index.
# Notice the Respondent column - it looks like a unique id, so let's make it the index of our DataFrame:
df.head(2)

Unnamed: 0,Respondent,MainBranch,Hobbyist,Age,Age1stCode,CompFreq,CompTotal,ConvertedComp,Country,CurrencyDesc,CurrencySymbol,DatabaseDesireNextYear,DatabaseWorkedWith,DevType,EdLevel,Employment,Ethnicity,Gender,JobFactors,JobSat,JobSeek,LanguageDesireNextYear,LanguageWorkedWith,MiscTechDesireNextYear,MiscTechWorkedWith,NEWCollabToolsDesireNextYear,NEWCollabToolsWorkedWith,NEWDevOps,NEWDevOpsImpt,NEWEdImpt,NEWJobHunt,NEWJobHuntResearch,NEWLearn,NEWOffTopic,NEWOnboardGood,NEWOtherComms,NEWOvertime,NEWPurchaseResearch,NEWPurpleLink,NEWSOSites,NEWStuck,OpSys,OrgSize,PlatformDesireNextYear,PlatformWorkedWith,PurchaseWhat,Sexuality,SOAccount,SOComm,SOPartFreq,SOVisitFreq,SurveyEase,SurveyLength,Trans,UndergradMajor,WebframeDesireNextYear,WebframeWorkedWith,WelcomeChange,WorkWeekHrs,YearsCode,YearsCodePro
0,1,I am a developer by profession,Yes,,13,Monthly,,,Germany,European Euro,EUR,Microsoft SQL Server,Elasticsearch;Microsoft SQL Server;Oracle,"Developer, desktop or enterprise applications;...","Master’s degree (M.A., M.S., M.Eng., MBA, etc.)","Independent contractor, freelancer, or self-em...",White or of European descent,Man,"Languages, frameworks, and other technologies ...",Slightly satisfied,I am not interested in new job opportunities,C#;HTML/CSS;JavaScript,C#;HTML/CSS;JavaScript,.NET Core;Xamarin,.NET;.NET Core,Microsoft Teams;Microsoft Azure;Trello,Confluence;Jira;Slack;Microsoft Azure;Trello,No,Somewhat important,Fairly important,,,Once a year,Not sure,,No,Often: 1-2 days per week or more,Start a free trial;Ask developers I know/work ...,Amused,Stack Overflow (public Q&A for anyone who codes),Visit Stack Overflow;Go for a walk or other ph...,Windows,2 to 9 employees,Android;iOS;Kubernetes;Microsoft Azure;Windows,Windows,,Straight / Heterosexual,No,"No, not at all",,Multiple times per day,Neither easy nor difficult,Appropriate in length,No,"Computer science, computer engineering, or sof...",ASP.NET Core,ASP.NET;ASP.NET Core,Just as welcome now as I felt last year,50.0,36,27
1,2,I am a developer by profession,No,,19,,,,United Kingdom,Pound sterling,GBP,,,"Developer, full-stack;Developer, mobile","Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Employed full-time,,,,Very dissatisfied,I am not interested in new job opportunities,Python;Swift,JavaScript;Swift,React Native;TensorFlow;Unity 3D,React Native,Github;Slack,Confluence;Jira;Github;Gitlab;Slack,,,Fairly important,,,Once a year,Not sure,,No,,,Amused,Stack Overflow (public Q&A for anyone who code...,Visit Stack Overflow;Go for a walk or other ph...,MacOS,"1,000 to 4,999 employees",iOS;Kubernetes;Linux;MacOS,iOS,I have little or no influence,,Yes,"Yes, definitely",Less than once per month or monthly,Multiple times per day,,,,"Computer science, computer engineering, or sof...",,,Somewhat more welcome now than last year,,7,4


In [57]:
# Make Respondent the index, modifying df itself. This cell cannot be run twice:
# after the first run Respondent is no longer a column, so a second run raises KeyError.
df.set_index('Respondent', inplace=True)

In [59]:
# DONE:
df.head(2)  # Respondent is now the index; its values are probably ints - let's check the type

Unnamed: 0_level_0,MainBranch,Hobbyist,Age,Age1stCode,CompFreq,CompTotal,ConvertedComp,Country,CurrencyDesc,CurrencySymbol,DatabaseDesireNextYear,DatabaseWorkedWith,DevType,EdLevel,Employment,Ethnicity,Gender,JobFactors,JobSat,JobSeek,LanguageDesireNextYear,LanguageWorkedWith,MiscTechDesireNextYear,MiscTechWorkedWith,NEWCollabToolsDesireNextYear,NEWCollabToolsWorkedWith,NEWDevOps,NEWDevOpsImpt,NEWEdImpt,NEWJobHunt,NEWJobHuntResearch,NEWLearn,NEWOffTopic,NEWOnboardGood,NEWOtherComms,NEWOvertime,NEWPurchaseResearch,NEWPurpleLink,NEWSOSites,NEWStuck,OpSys,OrgSize,PlatformDesireNextYear,PlatformWorkedWith,PurchaseWhat,Sexuality,SOAccount,SOComm,SOPartFreq,SOVisitFreq,SurveyEase,SurveyLength,Trans,UndergradMajor,WebframeDesireNextYear,WebframeWorkedWith,WelcomeChange,WorkWeekHrs,YearsCode,YearsCodePro
Respondent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1
1,I am a developer by profession,Yes,,13,Monthly,,,Germany,European Euro,EUR,Microsoft SQL Server,Elasticsearch;Microsoft SQL Server;Oracle,"Developer, desktop or enterprise applications;...","Master’s degree (M.A., M.S., M.Eng., MBA, etc.)","Independent contractor, freelancer, or self-em...",White or of European descent,Man,"Languages, frameworks, and other technologies ...",Slightly satisfied,I am not interested in new job opportunities,C#;HTML/CSS;JavaScript,C#;HTML/CSS;JavaScript,.NET Core;Xamarin,.NET;.NET Core,Microsoft Teams;Microsoft Azure;Trello,Confluence;Jira;Slack;Microsoft Azure;Trello,No,Somewhat important,Fairly important,,,Once a year,Not sure,,No,Often: 1-2 days per week or more,Start a free trial;Ask developers I know/work ...,Amused,Stack Overflow (public Q&A for anyone who codes),Visit Stack Overflow;Go for a walk or other ph...,Windows,2 to 9 employees,Android;iOS;Kubernetes;Microsoft Azure;Windows,Windows,,Straight / Heterosexual,No,"No, not at all",,Multiple times per day,Neither easy nor difficult,Appropriate in length,No,"Computer science, computer engineering, or sof...",ASP.NET Core,ASP.NET;ASP.NET Core,Just as welcome now as I felt last year,50.0,36,27
2,I am a developer by profession,No,,19,,,,United Kingdom,Pound sterling,GBP,,,"Developer, full-stack;Developer, mobile","Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Employed full-time,,,,Very dissatisfied,I am not interested in new job opportunities,Python;Swift,JavaScript;Swift,React Native;TensorFlow;Unity 3D,React Native,Github;Slack,Confluence;Jira;Github;Gitlab;Slack,,,Fairly important,,,Once a year,Not sure,,No,,,Amused,Stack Overflow (public Q&A for anyone who code...,Visit Stack Overflow;Go for a walk or other ph...,MacOS,"1,000 to 4,999 employees",iOS;Kubernetes;Linux;MacOS,iOS,I have little or no influence,,Yes,"Yes, definitely",Less than once per month or monthly,Multiple times per day,,,,"Computer science, computer engineering, or sof...",,,Somewhat more welcome now than last year,,7,4


In [63]:
# As we suspected, the labels are ints (dtype int64), cool.
# Note they are not a contiguous 1..N range: the last label is 65112 while the length is 64461,
# so a label (.loc) and a position (.iloc) are not interchangeable here.
df.index

Int64Index([    1,     2,     3,     4,     5,     6,     7,     8,     9,
               10,
            ...
            64104, 64236, 64330, 64480, 64567, 64858, 64867, 64898, 64925,
            65112],
           dtype='int64', name='Respondent', length=64461)

In [106]:
df.iloc[1, 0]  # by position: 2nd row, 1st column (MainBranch, now that Respondent is the index)

'I am a developer by profession'

In [111]:
# Index the schema by its Column values so a question can be looked up by column name.
# In-place: a second run raises KeyError because Column is then no longer a column.
schema.set_index('Column', inplace=True)

In [114]:
schema.head(2)  # Column is now the index; QuestionText is the only remaining column

Unnamed: 0_level_0,QuestionText
Column,Unnamed: 1_level_1
Respondent,Randomized respondent ID number (not in order ...
MainBranch,Which of the following options best describes ...


In [117]:
# Once a custom index is set you can, for example, sort by it:
schema.sort_index().head(4)  # pass ascending=False to .sort_index() for reverse order :)

Unnamed: 0_level_0,QuestionText
Column,Unnamed: 1_level_1
Age,What is your age (in years)? If you prefer not...
Age1stCode,At what age did you write your first line of c...
CompFreq,"Is that compensation weekly, monthly, or yearly?"
CompTotal,What is your current total compensation (salar...


In [118]:
# AND REMEMBER: if you want these changes to be PERMANENT, just add the argument inplace=True.
# So schema.sort_index(inplace=True [, ascending=..., args...]) would permanently re-order the DataFrame itself.

---
## Filtering - Using Conditionals to Filter Rows and Columns