51

In [2]:
import numpy as np
import pandas as pd

from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier


np.random.seed(42)
raw_data = make_moons(n_samples=2000, noise=0.25, random_state=42)
data = raw_data[0]
target = raw_data[1]

X_train, X_test, y_train, y_test = train_test_split(data, target)

tree = RandomForestClassifier(random_state=42)

tree.fit(X_train, y_train)
acc = tree.score(X_test, y_test)
print(f"Accuracy: {acc:.4f}")

Accuracy: 0.9300


52

In [10]:
import numpy as np
import pandas as pd

from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV


np.random.seed(42)
raw_data = make_moons(n_samples=2000, noise=0.25, random_state=42)
data = raw_data[0]
target = raw_data[1]

X_train, X_test, y_train, y_test = train_test_split(data, target)
cls = RandomForestClassifier(random_state=42)
param_grid = {"criterion": ["gini", "entropy"], "max_depth": [6,7,8], "min_samples_leaf": [4,5]}
search = GridSearchCV(cls, scoring="accuracy", param_grid=param_grid, cv=2)
search.fit(X_train, y_train)
# search.transform(X_test, y_test)
search.score(X_test, y_test)
search.best_params_

{'criterion': 'gini', 'max_depth': 8, 'min_samples_leaf': 4}

53

In [18]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
documents = [
    'python is a programming language',
    'python is popular',
    'programming in python',
    'object-oriented programming in python'
]

vec = CountVectorizer()
data = vec.fit_transform(documents)

df = pd.DataFrame(data=data.toarray())
df.columns = vec.get_feature_names_out()
df

Unnamed: 0,in,is,language,object,oriented,popular,programming,python
0,0,1,1,0,0,0,1,1
1,0,1,0,0,0,1,0,1
2,1,0,0,0,0,0,1,1
3,1,0,0,1,1,0,1,1


54

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

documents = [
    'python is a programming language',
    'python is popular',
    'programming in python',
    'object-oriented programming in python'
]

vec = CountVectorizer(stop_words="english")
data = vec.fit_transform(documents)
df = pd.DataFrame(data=data.toarray())
df.columns = vec.get_feature_names_out()
df

Unnamed: 0,language,object,oriented,popular,programming,python
0,1,0,0,0,1,1
1,0,0,0,1,0,1
2,0,0,0,0,1,1
3,0,1,1,0,1,1


55

In [21]:
documents = [
    'python is a programming language',
    'python is popular',
    'programming in python',
    'object-oriented programming in python'
]

vec = CountVectorizer(stop_words="english", ngram_range=(1,2))
data = vec.fit_transform(documents)
df = pd.DataFrame(data=data.toarray())
df.columns = vec.get_feature_names_out()
df

Unnamed: 0,language,object,object oriented,oriented,oriented programming,popular,programming,programming language,programming python,python,python popular,python programming
0,1,0,0,0,0,0,1,1,0,1,0,1
1,0,0,0,0,0,1,0,0,0,1,1,0
2,0,0,0,0,0,0,1,0,1,1,0,0
3,0,1,1,1,1,0,1,0,1,1,0,0


56

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

pd.options.display.precision = 3

documents = [
    'python is a programming language',
    'python is popular',
    'programming in python',
    'object-oriented programming in python',
    'programming language'
]

vectorizer = TfidfVectorizer()
data = vectorizer.fit_transform(documents)
df = pd.DataFrame(data=data.toarray(), columns=vectorizer.get_feature_names_out())
df

Unnamed: 0,in,is,language,object,oriented,popular,programming,python
0,0.0,0.58,0.58,0.0,0.0,0.0,0.405,0.405
1,0.0,0.575,0.0,0.0,0.0,0.713,0.0,0.402
2,0.712,0.0,0.0,0.0,0.0,0.0,0.497,0.497
3,0.445,0.0,0.0,0.552,0.552,0.0,0.311,0.311
4,0.0,0.0,0.82,0.0,0.0,0.0,0.573,0.0


57

In [27]:
documents = [
    'python is a programming language',
    'python is popular',
    'programming in python',
    'object-oriented programming in python',
    'programming language'
]

vectorizer = TfidfVectorizer(stop_words=["is", "in"])
data = vectorizer.fit_transform(documents)
df = pd.DataFrame(data=data.toarray(), columns=vectorizer.get_feature_names_out())
df

Unnamed: 0,language,object,oriented,popular,programming,python
0,0.712,0.0,0.0,0.0,0.497,0.497
1,0.0,0.0,0.0,0.871,0.0,0.491
2,0.0,0.0,0.0,0.0,0.707,0.707
3,0.0,0.616,0.616,0.0,0.347,0.347
4,0.82,0.0,0.0,0.0,0.573,0.0


58

In [70]:
import random
from numpy.linalg import norm
df = pd.read_csv("/content/data.csv")

maxes = df.max(axis=0)
mins = df.min(axis=0)

centroid_x1 = np.array([
    random.uniform(mins.x1, maxes.x1),
    random.uniform(mins.x2, maxes.x2)])
centroid_x2 = np.array([
    random.uniform(mins.x1, maxes.x1),
    random.uniform(mins.x2, maxes.x2)])

data = df.values

for _ in range(10):
  clusters = []
  for point in data:
      dist1 = norm(centroid_x1 - point)
      dist2 = norm(centroid_x2 - point)
      if dist1 < dist2:
        clusters.append(1)
      else:
        clusters.append(2)

  df["cluster"] = clusters

  centroid_x1 = [
      round(df[df.cluster == 1].x1.mean(), 3),
      round(df[df.cluster == 1].x2.mean(), 3)
  ]

  centroid_x2 = [
      round(df[df.cluster == 2].x1.mean(), 3),
      round(df[df.cluster == 2].x2.mean(), 3)
  ]

print(centroid_x1)
print(centroid_x2)

[2.468 4.093]
[1.387 1.2  ]
[0.352, 2.502]
[2.663, -3.083]


59

In [75]:
from sklearn.cluster import KMeans

df = pd.read_csv("/content/clusters.csv")
k_means = KMeans(n_clusters=3, max_iter=1000, random_state=42)
k_means.fit(df)
k_means.cluster_centers_

  super()._check_params_vs_input(X, default_n_init=10)


array([[-2.157, -4.305],
       [ 4.867,  0.424],
       [-0.555, -0.33 ]])

60

In [79]:
from sklearn.cluster import KMeans

df = pd.read_csv("/content/clusters.csv")
k_means = KMeans(n_clusters=3, max_iter=1000, random_state=42)
df["y_kmeans"] = k_means.fit_predict(df)
df.head(10)


  super()._check_params_vs_input(X, default_n_init=10)


Unnamed: 0,x1,x2,y_kmeans
0,-2.776,-4.167,0
1,-1.336,-1.084,2
2,6.507,-0.159,1
3,-0.957,0.235,2
4,-1.558,-3.97,0
5,-0.652,-1.333,2
6,5.561,1.517,1
7,-0.891,-3.456,0
8,6.391,3.597,1
9,5.813,-0.846,1
