Numpy introduction
numpy arrays

In [1]:
import numpy as np
arr = np.array([1,3,4,5,6])
arr

array([1, 3, 4, 5, 6])

In [2]:
arr.shape

(5,)

In [3]:
arr.dtype

dtype('int64')

In [4]:
arr = np.array([1,'st','er',3])
arr.dtype

dtype('<U21')

In [5]:
np.sum(arr)

UFuncTypeError: ufunc 'add' did not contain a loop with signature matching types (dtype('<U21'), dtype('<U21')) -> None

In [None]:
# creating arrays
arr = np.array([[1,2,3],[2,4,6],[8,8,8]])
arr.shape

In [None]:
arr

In [None]:
arr = np.zeros((2,4))
arr

In [None]:
arr = np.ones((2,4))
arr

In [None]:
arr = np.identity(3)
arr

In [None]:
arr = np.random.randn(3,4)
arr

In [None]:
from io import BytesIO
b = BytesIO(b"2,23,33\n32,42,63.4\n35,77,12")
arr = np.genfromtxt(b, delimiter=",")
arr

In [None]:
# Accessing array elements
# simple indexing
arr[1]

In [None]:
arr = np.arange(12).reshape(2,2,3)
arr

In [None]:
arr[0]

In [None]:
arr = np.arange(10)
arr[5:]

In [None]:
arr[5:8]

In [None]:
arr[:-5]

In [None]:
arr = np.arange(12).reshape(2,2,3)
arr

In [None]:
arr[1:2]

In [None]:
arr = np.arange(27).reshape(3,3,3)
arr

In [None]:
arr[:,:,2]

In [None]:
arr[...,2]

Advanced indexing

In [None]:
arr = np.arange(9).reshape(3,3)
arr

In [None]:
arr[[0,1,2],[1,0,0]]

Boolean indexing

In [None]:
cities = np.array(["delhi","banglaore","mumbai","chennai","bhopal"])
city_data = np.random.randn(5,3)
city_data

In [None]:
city_data[cities == "delhi"]

In [None]:
city_data[city_data >0]

In [None]:
city_data[city_data > 0] = 0
city_data

Operations on arrays

https://numpy.org/doc/stable/reference/ufuncs.html

In [None]:
arr = np.arange(15).reshape(3,5)
arr

In [None]:
arr + 5

In [None]:
arr *2

In [None]:
arr1 = np.arange(15).reshape(5,3)
arr2 = np.arange(5).reshape(5,1)


In [None]:
arr1


In [None]:
arr2


In [None]:
arr2 + arr1

In [None]:
arr1 = np.random.randn(5,3)
arr1

In [None]:
np.modf(arr1) #modf will return the fractional and the integer part of the input supplied to it

Linear algebra using numpy

https://numpy.org/doc/stable/reference/routines.linalg.html

In [None]:
A = np.array([[1,2,3],[4,5,6],[7,8,9]])
B = np.array([[9,8,7],[6,5,4],[1,2,3]])
A.dot(B)

In [None]:
A = np.arange(15).reshape(3,5)
A.T # transpose of a matrix

In [None]:
np.linalg.svd(A) # matrix factorization

In [None]:
a = np.array([[7,5,-3],[3,-5,2],[5,3,-7]])
b = np.array([16,-8,0])
x= np.linalg.solve(a,b) # solve a system of equations
x

#### 7x + 5y -3z = 16
#### 3x - 5y + 2z = -8
#### 5x + 3y - 7z = 0

In [None]:
np.allclose(np.dot(a,x),b) # check if the solution is correct

# Pandas

In [7]:
# Data frames
import pandas as pd
d = [{'city':'Delhi',"data":1000},
    {'city':'Banglaore',"data":2000},
    {'city':'Mumbai',"data":1000}]
pd.DataFrame(d)

Unnamed: 0,city,data
0,Delhi,1000
1,Banglaore,2000
2,Mumbai,1000


In [8]:
df = pd.DataFrame(d)

In [9]:
# reading in data
city_data = pd.read_csv(filepath_or_buffer='simplemaps-worldcities-basic.csv')

In [10]:
city_data.head(n=10)

Unnamed: 0,city,city_ascii,lat,lng,pop,country,iso2,iso3,province
0,Qal eh-ye Now,Qal eh-ye,34.983,63.1333,2997.0,Afghanistan,AF,AFG,Badghis
1,Chaghcharan,Chaghcharan,34.516701,65.250001,15000.0,Afghanistan,AF,AFG,Ghor
2,Lashkar Gah,Lashkar Gah,31.582998,64.36,201546.0,Afghanistan,AF,AFG,Hilmand
3,Zaranj,Zaranj,31.112001,61.886998,49851.0,Afghanistan,AF,AFG,Nimroz
4,Tarin Kowt,Tarin Kowt,32.633298,65.866699,10000.0,Afghanistan,AF,AFG,Uruzgan
5,Zareh Sharan,Zareh Sharan,32.85,68.416705,13737.0,Afghanistan,AF,AFG,Paktika
6,Asadabad,Asadabad,34.866,71.150005,48400.0,Afghanistan,AF,AFG,Kunar
7,Taloqan,Taloqan,36.729999,69.540004,64256.0,Afghanistan,AF,AFG,Takhar
8,Mahmud-E Eraqi,Mahmud-E Eraqi,35.016696,69.333301,7407.0,Afghanistan,AF,AFG,Kapisa
9,Mehtar Lam,Mehtar Lam,34.65,70.166701,17345.0,Afghanistan,AF,AFG,Laghman


In [11]:
city_data.tail()

Unnamed: 0,city,city_ascii,lat,lng,pop,country,iso2,iso3,province
7317,Mutare,Mutare,-18.970019,32.650038,216785.0,Zimbabwe,ZW,ZWE,Manicaland
7318,Kadoma,Kadoma,-18.330006,29.909947,56400.0,Zimbabwe,ZW,ZWE,Mashonaland West
7319,Chitungwiza,Chitungwiza,-18.000001,31.100003,331071.0,Zimbabwe,ZW,ZWE,Harare
7320,Harare,Harare,-17.81779,31.044709,1557406.5,Zimbabwe,ZW,ZWE,Harare
7321,Bulawayo,Bulawayo,-20.169998,28.580002,697096.0,Zimbabwe,ZW,ZWE,Bulawayo


In [12]:
series_es = city_data.lat

In [13]:
type(series_es)

pandas.core.series.Series

In [14]:
series_es[1:10:2]

1    34.516701
3    31.112001
5    32.850000
7    36.729999
9    34.650000
Name: lat, dtype: float64

In [15]:
series_es[:7]

0    34.983000
1    34.516701
2    31.582998
3    31.112001
4    32.633298
5    32.850000
6    34.866000
Name: lat, dtype: float64

In [16]:
series_es[:-7315]

0    34.983000
1    34.516701
2    31.582998
3    31.112001
4    32.633298
5    32.850000
6    34.866000
Name: lat, dtype: float64

In [17]:
city_data[:7]

Unnamed: 0,city,city_ascii,lat,lng,pop,country,iso2,iso3,province
0,Qal eh-ye Now,Qal eh-ye,34.983,63.1333,2997.0,Afghanistan,AF,AFG,Badghis
1,Chaghcharan,Chaghcharan,34.516701,65.250001,15000.0,Afghanistan,AF,AFG,Ghor
2,Lashkar Gah,Lashkar Gah,31.582998,64.36,201546.0,Afghanistan,AF,AFG,Hilmand
3,Zaranj,Zaranj,31.112001,61.886998,49851.0,Afghanistan,AF,AFG,Nimroz
4,Tarin Kowt,Tarin Kowt,32.633298,65.866699,10000.0,Afghanistan,AF,AFG,Uruzgan
5,Zareh Sharan,Zareh Sharan,32.85,68.416705,13737.0,Afghanistan,AF,AFG,Paktika
6,Asadabad,Asadabad,34.866,71.150005,48400.0,Afghanistan,AF,AFG,Kunar


In [18]:
city_data.iloc[:5,:4]

Unnamed: 0,city,city_ascii,lat,lng
0,Qal eh-ye Now,Qal eh-ye,34.983,63.1333
1,Chaghcharan,Chaghcharan,34.516701,65.250001
2,Lashkar Gah,Lashkar Gah,31.582998,64.36
3,Zaranj,Zaranj,31.112001,61.886998
4,Tarin Kowt,Tarin Kowt,32.633298,65.866699


In [19]:
city_data[city_data['pop'] > 10000000][city_data.columns[pd.Series(city_data.columns).str.startswith('l')]]

Unnamed: 0,lat,lng
360,-34.602502,-58.397531
1171,-23.55868,-46.62502
2068,31.216452,121.436505
3098,28.669993,77.230004
3110,19.01699,72.856989
3492,35.685017,139.751407
4074,19.442442,-99.130988
4513,24.869992,66.990009
5394,55.752164,37.615523
6124,41.104996,29.010002


In [20]:
city_greater_10mil = city_data[city_data['pop'] > 10000000]
city_greater_10mil.rename(columns={'pop':'population'}, inplace=True)
city_greater_10mil.where(city_greater_10mil.population > 15000000)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  city_greater_10mil.rename(columns={'pop':'population'}, inplace=True)


Unnamed: 0,city,city_ascii,lat,lng,population,country,iso2,iso3,province
360,,,,,,,,,
1171,,,,,,,,,
2068,,,,,,,,,
3098,,,,,,,,,
3110,Mumbai,Mumbai,19.01699,72.856989,15834918.0,India,IN,IND,Maharashtra
3492,Tokyo,Tokyo,35.685017,139.751407,22006299.5,Japan,JP,JPN,Tokyo
4074,,,,,,,,,
4513,,,,,,,,,
5394,,,,,,,,,
6124,,,,,,,,,


In [21]:
df = pd.DataFrame(np.random.rand(8,3),
                 columns=['A','B','C'])
df

Unnamed: 0,A,B,C
0,0.1587,0.094656,0.13751
1,0.650685,0.950634,0.256311
2,0.391696,0.868355,0.913967
3,0.371166,0.999633,0.846034
4,0.152068,0.875677,0.518127
5,0.913301,0.03569,0.427451
6,0.927069,0.052564,0.869088
7,0.937398,0.640469,0.362137


Operations on dataframes

In [22]:
nparray = df.values
type(nparray)

numpy.ndarray

In [23]:
from numpy import nan
df.iloc[4,2] = nan
df

Unnamed: 0,A,B,C
0,0.1587,0.094656,0.13751
1,0.650685,0.950634,0.256311
2,0.391696,0.868355,0.913967
3,0.371166,0.999633,0.846034
4,0.152068,0.875677,
5,0.913301,0.03569,0.427451
6,0.927069,0.052564,0.869088
7,0.937398,0.640469,0.362137


In [24]:
df.fillna(0)

Unnamed: 0,A,B,C
0,0.1587,0.094656,0.13751
1,0.650685,0.950634,0.256311
2,0.391696,0.868355,0.913967
3,0.371166,0.999633,0.846034
4,0.152068,0.875677,0.0
5,0.913301,0.03569,0.427451
6,0.927069,0.052564,0.869088
7,0.937398,0.640469,0.362137


In [25]:
columns_numeric = ['lat','lng','pop']

In [26]:
city_data[columns_numeric].mean()

lat        20.662876
lng        10.711914
pop    265463.071633
dtype: float64

In [27]:
city_data[columns_numeric].sum()

lat    1.512936e+05
lng    7.843263e+04
pop    1.943721e+09
dtype: float64

In [28]:
city_data[columns_numeric].count()

lat    7322
lng    7322
pop    7322
dtype: int64

In [29]:
city_data[columns_numeric].median()

lat       26.792730
lng       18.617509
pop    61322.750000
dtype: float64

In [30]:
city_data[columns_numeric].quantile()

lat       26.792730
lng       18.617509
pop    61322.750000
Name: 0.5, dtype: float64

In [31]:
city_data[columns_numeric].sum(axis = 1).head()

0      3095.116300
1     15099.766702
2    201641.942998
3     49943.998999
4     10098.499997
dtype: float64

In [32]:
city_data[columns_numeric].describe()
#calculate the most important statistics for numerical data in one go

Unnamed: 0,lat,lng,pop
count,7322.0,7322.0,7322.0
mean,20.662876,10.711914,265463.1
std,29.134818,79.044615,828762.2
min,-89.982894,-179.589979,-99.0
25%,-0.32471,-64.788472,17344.25
50%,26.79273,18.617509,61322.75
75%,43.575448,73.103628,200172.6
max,82.483323,179.383304,22006300.0


Concatenating data frames

In [34]:
city_data1 = city_data.sample(3)
city_data2 = city_data.sample(3)
city_data_combine = pd.concat([city_data1, city_data2])
city_data_combine

Unnamed: 0,city,city_ascii,lat,lng,pop,country,iso2,iso3,province
3972,Linares,Linares,24.86038,-99.570031,52349.5,Mexico,MX,MEX,Nuevo León
590,Hobart,Hobart,-42.850009,147.29503,64285.0,Australia,AU,AUS,Tasmania
2403,Zagazig,Zagazig,30.583332,31.51666,285097.0,Egypt,EG,EGY,Ash Sharqiyah
5224,Bagdarin,Bagdarin,54.433304,113.600032,4676.0,Russia,RU,RUS,Buryat
2652,Braunschweig,Braunschweig,52.249975,10.50002,239884.5,Germany,DE,DEU,Niedersachsen
5711,El Fasher,El Fasher,13.629981,25.350018,220906.0,Sudan,SD,SDN,Northern Darfur


In [36]:
df1 = pd.DataFrame({'col1': ['col10','col11', 'col12','col13'],
                   'col2': ['col20','col21','col22','col23'],
                   'col3': ['col30','col31','col32','col33'],
                   'col4': ['col40','col41','col42','col43']},
                  index=[0,1,2,3])
df1

Unnamed: 0,col1,col2,col3,col4
0,col10,col20,col30,col40
1,col11,col21,col31,col41
2,col12,col22,col32,col42
3,col13,col23,col33,col43


In [37]:
df4 = pd.DataFrame({'col2': ['col22', 'col23', 'col26', 'col27'],
                    'Col4': ['Col42', 'Col43', 'Col46', 'Col47'],
                    'col6': ['col62', 'col63', 'col66', 'col67']},
                   index=[2, 3, 6, 7])
df4

Unnamed: 0,col2,Col4,col6
2,col22,Col42,col62
3,col23,Col43,col63
6,col26,Col46,col66
7,col27,Col47,col67


In [38]:
pd.concat([df1,df4], axis=1)

Unnamed: 0,col1,col2,col3,col4,col2.1,Col4,col6
0,col10,col20,col30,col40,,,
1,col11,col21,col31,col41,,,
2,col12,col22,col32,col42,col22,Col42,col62
3,col13,col23,col33,col43,col23,Col43,col63
6,,,,,col26,Col46,col66
7,,,,,col27,Col47,col67


In [39]:
country_data = city_data[['iso3','country']].drop_duplicates()
country_data.shape

(223, 2)

In [40]:
country_data.head()

Unnamed: 0,iso3,country
0,AFG,Afghanistan
33,ALD,Aland
34,ALB,Albania
60,DZA,Algeria
111,ASM,American Samoa


In [41]:
del(city_data['country'])

In [42]:
city_data.merge(country_data, 'inner').head()

Unnamed: 0,city,city_ascii,lat,lng,pop,iso2,iso3,province,country
0,Qal eh-ye Now,Qal eh-ye,34.983,63.1333,2997.0,AF,AFG,Badghis,Afghanistan
1,Chaghcharan,Chaghcharan,34.516701,65.250001,15000.0,AF,AFG,Ghor,Afghanistan
2,Lashkar Gah,Lashkar Gah,31.582998,64.36,201546.0,AF,AFG,Hilmand,Afghanistan
3,Zaranj,Zaranj,31.112001,61.886998,49851.0,AF,AFG,Nimroz,Afghanistan
4,Tarin Kowt,Tarin Kowt,32.633298,65.866699,10000.0,AF,AFG,Uruzgan,Afghanistan


## Scikit-learn

https://scikit-learn.org/stable/

In [43]:
from sklearn import datasets

In [44]:
diabetes = datasets.load_diabetes()
X = diabetes.data[:10]
y = diabetes.target
X[:5]

array([[ 0.03807591,  0.05068012,  0.06169621,  0.02187235, -0.0442235 ,
        -0.03482076, -0.04340085, -0.00259226,  0.01990842, -0.01764613],
       [-0.00188202, -0.04464164, -0.05147406, -0.02632783, -0.00844872,
        -0.01916334,  0.07441156, -0.03949338, -0.06832974, -0.09220405],
       [ 0.08529891,  0.05068012,  0.04445121, -0.00567061, -0.04559945,
        -0.03419447, -0.03235593, -0.00259226,  0.00286377, -0.02593034],
       [-0.08906294, -0.04464164, -0.01159501, -0.03665645,  0.01219057,
         0.02499059, -0.03603757,  0.03430886,  0.02269202, -0.00936191],
       [ 0.00538306, -0.04464164, -0.03638469,  0.02187235,  0.00393485,
         0.01559614,  0.00814208, -0.00259226, -0.03199144, -0.04664087]])

In [45]:
y[:10]

array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310.])

In [46]:
feature_names = ['age','sex','bmi','bp','s1','s2','s3','s4','s5','s6']

Scikit example regression

In [48]:
from sklearn import datasets
from sklearn.linear_model import Lasso

from sklearn import linear_model, datasets
from sklearn.model_selection import GridSearchCV

diabetes = datasets.load_diabetes()
X_train = diabetes.data[:310]
Y_train = diabetes.target[:310]

X_test = diabetes.data[310:]
Y_test = diabetes.data[310:]

lasso = Lasso(random_state=0)
alphas = np.logspace(-4, -0.5, 30)

score = list()
score_std = list()

estimator = GridSearchCV(lasso, param_grid = dict(alpha=alphas))
estimator.fit(X_train, Y_train)

GridSearchCV(estimator=Lasso(random_state=0),
             param_grid={'alpha': array([1.00000000e-04, 1.32035178e-04, 1.74332882e-04, 2.30180731e-04,
       3.03919538e-04, 4.01280703e-04, 5.29831691e-04, 6.99564216e-04,
       9.23670857e-04, 1.21957046e-03, 1.61026203e-03, 2.12611233e-03,
       2.80721620e-03, 3.70651291e-03, 4.89390092e-03, 6.46167079e-03,
       8.53167852e-03, 1.12648169e-02, 1.48735211e-02, 1.96382800e-02,
       2.59294380e-02, 3.42359796e-02, 4.52035366e-02, 5.96845700e-02,
       7.88046282e-02, 1.04049831e-01, 1.37382380e-01, 1.81393069e-01,
       2.39502662e-01, 3.16227766e-01])})

In [49]:
estimator.best_score_

0.46170948106181975

In [50]:
estimator.best_estimator_

Lasso(alpha=0.07880462815669913, random_state=0)

In [51]:
estimator.predict(X_test)

array([199.93864949, 178.15721853, 123.03190244, 212.49049201,
       171.7225727 , 118.00948701, 201.00114035, 170.21721531,
       163.34131121, 183.90933367, 190.88238488, 278.22988324,
       288.52365106, 233.6560703 , 206.83259982, 227.77107101,
       156.62884613, 222.71162783, 187.99029083, 104.47239575,
       169.94555578, 110.98338081, 282.0855266 , 176.77816916,
        80.93962648,  87.75752311, 251.78458672, 163.64725803,
       123.67641457, 157.33359183, 160.41494649, 180.74753017,
       163.06411793, 154.46342209, 145.51419402, 130.61730734,
       188.10792017, 108.33007609, 128.07146849,  90.08085621,
       248.91715304,  86.45376283,  59.61600068, 189.70269521,
       208.04229933, 129.45568293,  95.70956072, 200.75788665,
        56.99031227, 169.76616519, 190.24966488, 120.81802108,
       227.80814   , 157.06636369, 158.42570567, 162.67629743,
       259.95041168, 259.48001858, 204.01293884, 182.123241  ,
        63.63372524, 213.62386558, 111.77752804, 137.49

# Deep learning frameworks

In [3]:
pip install --upgrade numpy==1.20.3

Collecting numpy==1.20.3
  Downloading numpy-1.20.3-cp39-cp39-macosx_10_9_x86_64.whl (16.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.1/16.1 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.22.1
    Uninstalling numpy-1.22.1:
      Successfully uninstalled numpy-1.22.1
Successfully installed numpy-1.20.3
You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.9/bin/python3 -m pip install --upgrade pip' command.[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [1]:
# Theano example

import numpy


import theano.tensor as T
from theano import function



In [2]:
x = T.dscalar('x')
y = T.dscalar('y')
z = x + y

In [3]:
f = function([x,y],z)
f(8,2)

array(10.)

In [9]:
# Tensorflow example  https://www.tensorflow.org/
import tensorflow as tf
tf.compat.v1.disable_eager_execution()
hello = tf.constant('Hello, TensorFlow!')
sess = tf.compat.v1.Session()
print(sess.run(hello))

b'Hello, TensorFlow!'


Building a neural network model with Keras

In [18]:
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()

X_train = cancer.data[:340]
Y_train = cancer.target[:340]

X_test = cancer.data[340:]
Y_test = cancer.target[340:]

import numpy as np
from keras import Sequential
from keras.layers import Dense, Dropout

In [19]:
model = Sequential()
model.add(Dense(15, input_dim=30, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [20]:
model.compile(loss='binary_crossentropy',
             optimizer='rmsprop',
             metrics=['accuracy'])

In [21]:
model.fit(X_train, Y_train,
         epochs=20,
         batch_size=50)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fe4af86ddc0>

In [32]:
predictions = (model.predict(X_test) > 0.5).astype("int32")

In [33]:
from sklearn import metrics

print('Accuracy:', metrics.accuracy_score(y_true=Y_test, y_pred=predictions))
print(metrics.classification_report(y_true=Y_test, y_pred=predictions))

Accuracy: 0.8427947598253275
              precision    recall  f1-score   support

           0       0.63      0.85      0.72        55
           1       0.95      0.84      0.89       174

    accuracy                           0.84       229
   macro avg       0.79      0.85      0.81       229
weighted avg       0.87      0.84      0.85       229



THe power of deep learning models

In [34]:
model = Sequential()
model.add(Dense(15, input_dim=30, activation='relu'))
model.add(Dense(15, activation='relu'))
model.add(Dense(15, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
             optimizer='rmsprop',
             metrics=['accuracy'])

model.fit(X_train, Y_train,
         epochs=20,
         batch_size=50)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fe48ff06a00>

In [35]:
predictions = (model.predict(X_test) > 0.5).astype("int32")

In [36]:
print('Accuracy:', metrics.accuracy_score(y_true=Y_test, y_pred=predictions))
print(metrics.classification_report(y_true=Y_test, y_pred=predictions))

Accuracy: 0.925764192139738
              precision    recall  f1-score   support

           0       0.85      0.84      0.84        55
           1       0.95      0.95      0.95       174

    accuracy                           0.93       229
   macro avg       0.90      0.90      0.90       229
weighted avg       0.93      0.93      0.93       229



# Natural language tool kit

In [10]:
pip install nltk

Collecting nltk
  Downloading nltk-3.7-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting regex>=2021.8.3
  Downloading regex-2022.7.25-cp39-cp39-macosx_10_9_x86_64.whl (289 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.7/289.7 KB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting tqdm
  Downloading tqdm-4.64.0-py2.py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.4/78.4 KB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tqdm, regex, nltk
Successfully installed nltk-3.7 regex-2022.7.25 tqdm-4.64.0
You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.9/bin/python3 -m pip install --upgrade pip' command.[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [1]:
import nltk

In [None]:
nltk.download()