## Example 1: Cleaning data

In [1]:
import pandas as pd

In [2]:
# Record the pandas version this notebook was executed with.
print(pd.__version__)

1.4.1


In [3]:
# Location of the exam-results CSV, relative to the notebook's directory.
EXAMS_CSV = '../data/exams.csv'

# Read the file into a DataFrame; fields are quoted with double quotes.
exam_data = pd.read_csv(EXAMS_CSV, quotechar='"')

# Display the loaded frame.
exam_data

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group D,some high school,standard,none,51,60,60
1,female,group E,some college,free/reduced,completed,70,68,78
2,female,group B,some high school,standard,none,56,59,57
3,female,group C,high school,standard,completed,89,96,100
4,female,group A,some high school,free/reduced,none,43,52,56
...,...,...,...,...,...,...,...,...
95,male,group E,bachelor's degree,free/reduced,none,87,93,80
96,male,group E,associate's degree,standard,completed,72,72,63
97,female,group B,associate's degree,standard,none,76,88,86
98,female,group C,high school,standard,none,49,61,55


In [4]:
# Mean of each raw score column, computed one column at a time.
subject_columns = ('math score', 'reading score', 'writing score')
math_avg, reading_avg, writing_avg = (
    exam_data[column].mean() for column in subject_columns
)

# Report the three averages, one labelled line per subject.
for label, average in (('Math avg:  ', math_avg),
                       ('Reading avg:  ', reading_avg),
                       ('Writing avg:  ', writing_avg)):
    print(label, average)

Math avg:   69.09
Reading avg:   71.98
Writing avg:   71.06


Preprocessing: standardize the score columns (zero mean, unit variance)

In [5]:
from sklearn import preprocessing

# Standardize the three score columns with a single, consistent call.
# `preprocessing.scale` operates column-wise by default, so each subject is
# centred on its own mean and scaled by its own standard deviation.
score_columns = ['math score', 'reading score', 'writing score']
exam_data[score_columns] = preprocessing.scale(exam_data[score_columns])

In [6]:
# Re-display the frame: the three score columns now hold the scaled values.
exam_data

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group D,some high school,standard,none,-1.301264,-0.882459,-0.801752
1,female,group E,some college,free/reduced,completed,0.065459,-0.293171,0.503088
2,female,group B,some high school,standard,none,-0.941600,-0.956120,-1.019225
3,female,group C,high school,standard,completed,1.432182,1.769338,2.097893
4,female,group A,some high school,free/reduced,none,-1.876726,-1.471747,-1.091716
...,...,...,...,...,...,...,...,...
95,male,group E,bachelor's degree,free/reduced,none,1.288316,1.548355,0.648070
96,male,group E,associate's degree,standard,completed,0.209324,0.001473,-0.584278
97,female,group B,associate's degree,standard,none,0.497056,1.180050,1.083017
98,female,group C,high school,standard,none,-1.445130,-0.808798,-1.164207


In [7]:
# Recompute the per-subject means after scaling; each should be ~0 up to
# floating-point error. Keys are the exact labels printed below.
means_by_label = {
    'Math avg:  ': exam_data['math score'].mean(),
    'Reading avg:  ': exam_data['reading score'].mean(),
    'Writing avg:  ': exam_data['writing score'].mean(),
}
math_avg, reading_avg, writing_avg = means_by_label.values()

for label, average in means_by_label.items():
    print(label, average)

Math avg:   -2.4424906541753446e-16
Reading avg:   -2.9309887850104134e-16
Writing avg:   -1.709743457922741e-16


Representing Labels as Numerical Values

In [8]:
# Encoder that maps each distinct label to an integer code; after fitting,
# `le.classes_` lists the original labels in code order.
le = preprocessing.LabelEncoder()

# Keep the codes as integers. Casting them with .astype(str) would store the
# strings '0'/'1' in an object column, which is not a numerical representation.
exam_data['gender'] = le.fit_transform(exam_data['gender'])

In [9]:
# Preview the first rows; `gender` now shows the encoder's codes.
exam_data.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,0,group D,some high school,standard,none,-1.301264,-0.882459,-0.801752
1,0,group E,some college,free/reduced,completed,0.065459,-0.293171,0.503088
2,0,group B,some high school,standard,none,-0.9416,-0.95612,-1.019225
3,0,group C,high school,standard,completed,1.432182,1.769338,2.097893
4,0,group A,some high school,free/reduced,none,-1.876726,-1.471747,-1.091716


In [10]:
# Original labels learned by the encoder. A label's position in this array is
# the integer code that replaced it in the column.
le.classes_

array(['female', 'male'], dtype=object)

In [11]:
# Use a dedicated encoder for this column: refitting the shared `le` would
# overwrite the gender classes it learned, so those codes could no longer be
# decoded with `le.inverse_transform`.
race_le = preprocessing.LabelEncoder()

# Store the codes as integers (not strings) so the column is truly numeric.
exam_data['race/ethnicity'] = race_le.fit_transform(exam_data['race/ethnicity'])

In [12]:
# Display the frame with both `gender` and `race/ethnicity` label-encoded.
exam_data

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,0,3,some high school,standard,none,-1.301264,-0.882459,-0.801752
1,0,4,some college,free/reduced,completed,0.065459,-0.293171,0.503088
2,0,1,some high school,standard,none,-0.941600,-0.956120,-1.019225
3,0,2,high school,standard,completed,1.432182,1.769338,2.097893
4,0,0,some high school,free/reduced,none,-1.876726,-1.471747,-1.091716
...,...,...,...,...,...,...,...,...
95,1,4,bachelor's degree,free/reduced,none,1.288316,1.548355,0.648070
96,1,4,associate's degree,standard,completed,0.209324,0.001473,-0.584278
97,0,1,associate's degree,standard,none,0.497056,1.180050,1.083017
98,0,2,high school,standard,none,-1.445130,-0.808798,-1.164207


In [13]:
# One-hot encode the race/ethnicity codes for display only: the result is not
# assigned, so `exam_data` itself is left unchanged by this cell.
pd.get_dummies(exam_data['race/ethnicity'])

Unnamed: 0,0,1,2,3,4
0,0,0,0,1,0
1,0,0,0,0,1
2,0,1,0,0,0
3,0,0,1,0,0
4,1,0,0,0,0
...,...,...,...,...,...
95,0,0,0,0,1
96,0,0,0,0,1
97,0,1,0,0,0
98,0,0,1,0,0


In [14]:
# One-hot encode the remaining text-valued columns. Each listed column is
# replaced by one indicator column per distinct value, named "<column>_<value>".
one_hot_columns = [
    'parental level of education',
    'lunch',
    'test preparation course',
]
exam_data = pd.get_dummies(exam_data, columns=one_hot_columns)

exam_data.head()

Unnamed: 0,gender,race/ethnicity,math score,reading score,writing score,parental level of education_associate's degree,parental level of education_bachelor's degree,parental level of education_high school,parental level of education_master's degree,parental level of education_some college,parental level of education_some high school,lunch_free/reduced,lunch_standard,test preparation course_completed,test preparation course_none
0,0,3,-1.301264,-0.882459,-0.801752,0,0,0,0,0,1,0,1,0,1
1,0,4,0.065459,-0.293171,0.503088,0,0,0,0,1,0,1,0,1,0
2,0,1,-0.9416,-0.95612,-1.019225,0,0,0,0,0,1,0,1,0,1
3,0,2,1.432182,1.769338,2.097893,0,0,1,0,0,0,0,1,1,0
4,0,0,-1.876726,-1.471747,-1.091716,0,0,0,0,0,1,1,0,0,1


## Working with Text

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

In [16]:
# Four short documents used as the toy corpus for the vectorizer examples.
corpus = [
    'This is the first document.',
    'This is the second document.',
    'This is the third document. Document four.',
    'Number five. To repeat, number five.',
]

In [17]:
# Learn the vocabulary from the corpus and count term occurrences per document.
vectorizer = CountVectorizer()
bag_of_words = vectorizer.fit_transform(corpus)

# The result is a sparse matrix with one row per document.
bag_of_words

<4x12 sparse matrix of type '<class 'numpy.int64'>'
	with 20 stored elements in Compressed Sparse Row format>

In [18]:
# Each printed line reads: (document_index, vocabulary_index)   word_count
print(bag_of_words)

  (0, 10)	1
  (0, 4)	1
  (0, 8)	1
  (0, 1)	1
  (0, 0)	1
  (1, 10)	1
  (1, 4)	1
  (1, 8)	1
  (1, 0)	1
  (1, 7)	1
  (2, 10)	1
  (2, 4)	1
  (2, 8)	1
  (2, 0)	2
  (2, 9)	1
  (2, 3)	1
  (3, 5)	2
  (3, 2)	2
  (3, 11)	1
  (3, 6)	1


In [19]:
# Column index assigned to the token 'document' (None if it is not in the vocabulary).
vectorizer.vocabulary_.get('document')

0

In [20]:
# Full token -> column-index mapping learned from the corpus.
vectorizer.vocabulary_

{'this': 10,
 'is': 4,
 'the': 8,
 'first': 1,
 'document': 0,
 'second': 7,
 'third': 9,
 'four': 3,
 'number': 5,
 'five': 2,
 'to': 11,
 'repeat': 6}

In [21]:
import pandas as pd

In [22]:
# Dense view of the counts: one row per document, one column per vocabulary term.
print(bag_of_words.toarray())

[[1 1 0 0 1 0 0 0 1 0 1 0]
 [1 0 0 0 1 0 0 1 1 0 1 0]
 [2 0 0 1 1 0 0 0 1 1 1 0]
 [0 0 2 0 0 2 1 0 0 0 0 1]]


In [23]:
# Same count matrix as a DataFrame, with the vocabulary terms as column names.
pd.DataFrame(bag_of_words.toarray(), columns=vectorizer.get_feature_names_out())

Unnamed: 0,document,first,five,four,is,number,repeat,second,the,third,this,to
0,1,1,0,0,1,0,0,0,1,0,1,0
1,1,0,0,0,1,0,0,1,1,0,1,0
2,2,0,0,1,1,0,0,0,1,1,1,0
3,0,0,2,0,0,2,1,0,0,0,0,1


In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

# NOTE: this rebinds `vectorizer` and `bag_of_words`, which previously held the
# CountVectorizer and its raw counts; from here on they refer to the TF-IDF
# vectorizer and its weighted matrix.
vectorizer = TfidfVectorizer()
bag_of_words = vectorizer.fit_transform(corpus)

In [25]:
# Sparse view again; the stored values are now TF-IDF weights, not integer counts.
print(bag_of_words)

  (0, 0)	0.39361148805201995
  (0, 1)	0.6166684570284894
  (0, 8)	0.39361148805201995
  (0, 4)	0.39361148805201995
  (0, 10)	0.39361148805201995
  (1, 7)	0.6166684570284894
  (1, 0)	0.39361148805201995
  (1, 8)	0.39361148805201995
  (1, 4)	0.39361148805201995
  (1, 10)	0.39361148805201995
  (2, 3)	0.4539889802043244
  (2, 9)	0.4539889802043244
  (2, 0)	0.5795505705562224
  (2, 8)	0.2897752852781112
  (2, 4)	0.2897752852781112
  (2, 10)	0.2897752852781112
  (3, 6)	0.31622776601683794
  (3, 11)	0.31622776601683794
  (3, 2)	0.6324555320336759
  (3, 5)	0.6324555320336759


In [26]:
# Column index of the token 'document' in the TF-IDF vectorizer's vocabulary.
vectorizer.vocabulary_.get('document')

0

In [28]:
# TF-IDF weights as a DataFrame, with the vocabulary terms as column names.
pd.DataFrame(bag_of_words.toarray(), columns=vectorizer.get_feature_names_out())

Unnamed: 0,document,first,five,four,is,number,repeat,second,the,third,this,to
0,0.393611,0.616668,0.0,0.0,0.393611,0.0,0.0,0.0,0.393611,0.0,0.393611,0.0
1,0.393611,0.0,0.0,0.0,0.393611,0.0,0.0,0.616668,0.393611,0.0,0.393611,0.0
2,0.579551,0.0,0.0,0.453989,0.289775,0.0,0.0,0.0,0.289775,0.453989,0.289775,0.0
3,0.0,0.0,0.632456,0.0,0.0,0.632456,0.316228,0.0,0.0,0.0,0.0,0.316228


In [29]:
# HashingVectorizer is suited to large amounts of data: it maps tokens to
# column indices with a hash function instead of storing a vocabulary.
# Consequently a column index cannot be converted back to the word it came from.
from sklearn.feature_extraction.text import HashingVectorizer

# Only 8 hash buckets are used here, so distinct words can share a column.
# NOTE(review): the 0.0 stored entries in the output are presumably signed
# hash collisions cancelling out — confirm.
vectorizer = HashingVectorizer(n_features=8)
feature_vector = vectorizer.fit_transform(corpus)
print(feature_vector)

  (0, 0)	-0.8944271909999159
  (0, 5)	0.4472135954999579
  (0, 6)	0.0
  (1, 0)	-0.5773502691896258
  (1, 3)	0.5773502691896258
  (1, 5)	0.5773502691896258
  (1, 6)	0.0
  (2, 0)	-0.5547001962252291
  (2, 5)	0.8320502943378437
  (2, 6)	0.0
  (3, 0)	0.31622776601683794
  (3, 1)	0.6324555320336759
  (3, 3)	0.31622776601683794
  (3, 7)	0.6324555320336759


## Working with images

In [32]:
!pip install --trusted-host pypi.org --trusted-host pypi.python.org --trusted-host files.pythonhosted.org opencv-python

Collecting opencv-python
  Downloading opencv_python-4.5.5.64-cp36-abi3-win_amd64.whl (20 kB)


ERROR: THESE PACKAGES DO NOT MATCH THE HASHES FROM THE REQUIREMENTS FILE. If you have updated the package versions, please update the hashes. Otherwise, examine the package contents carefully; someone may have tampered with them.
    opencv-python from https://files.pythonhosted.org/packages/48/c3/798bd7b8f78430f82ec0660b753106717e4e4bb8032ce56f77d8577cb24b/opencv_python-4.5.5.64-cp36-abi3-win_amd64.whl#sha256=408d5332550287aa797fd06bef47b2dfed163c6787668cc82ef9123a9484b56a:
        Expected sha256 408d5332550287aa797fd06bef47b2dfed163c6787668cc82ef9123a9484b56a
             Got        8b96c55f54785805e3e3763499d29e50c497ef069113f9822c52c000d07de9e6

You should consider upgrading via the 'C:\Python310\python.exe -m pip install --upgrade pip' command.


In [31]:
import cv2

ModuleNotFoundError: No module named 'cv2'