In [2]:
import pandas as pd
import numpy as np

In [3]:
# Set random seed for reproducibility
np.random.seed(42)

# Generate 50 social media follower counts between 500,000 and 5,000,000
followers = np.random.randint(500_000, 5_000_000, size=50).astype(float)

# Randomly set 5 values to NaN
nan_indices = np.random.choice(range(50), size=5, replace=False)
followers[nan_indices] = np.nan

# Generate random sold_out values (0 or 1)
sold_out = np.random.choice([0, 1], size=50)

# Create the DataFrame
df = pd.DataFrame({
    'Social_media_followers': followers,
    'Sold_out': sold_out
})

In [4]:
df

Unnamed: 0,Social_media_followers,Sold_out
0,2192743.0,1
1,4804572.0,0
2,2734489.0,0
3,2070006.0,1
4,1636074.0,1
5,,0
6,1739911.0,0
7,4972471.0,1
8,2638242.0,1
9,2266891.0,1


In [5]:
X1 = df[['Social_media_followers']]

In [6]:
X1

Unnamed: 0,Social_media_followers
0,2192743.0
1,4804572.0
2,2734489.0
3,2070006.0
4,1636074.0
5,
6,1739911.0
7,4972471.0
8,2638242.0
9,2266891.0


In [7]:
y1 = df[['Sold_out']]

In [8]:
from sklearn.model_selection import train_test_split

In [10]:
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, random_state=11, test_size=0.3)

In [11]:
X1_train.shape

(35, 1)

In [12]:
X1_test.shape

(15, 1)

In [13]:
from sklearn.impute import SimpleImputer 

In [14]:
#NaN
imputer = SimpleImputer(strategy = 'mean')

In [15]:
from sklearn.linear_model import LogisticRegression

In [17]:
lr = LogisticRegression()

In [18]:
from sklearn.pipeline import make_pipeline

In [19]:
pipe1 = make_pipeline(imputer, lr)

In [20]:
pipe1.fit(X1_train, y1_train)

  y = column_or_1d(y, warn=True)


In [21]:
pipe1.score(X1_train, y1_train)

0.6571428571428571

In [22]:
pipe1.score(X1_test, y1_test)


0.6666666666666666

In [23]:
pipe1.named_steps.simpleimputer.statistics_

array([2998870.18181818])

In [25]:
pipe1.named_steps.logisticregression.coef_


array([[-1.98162013e-07]])

In [27]:
#more advanced pipeline

np.random.seed(42)

genres = ['Rock', 'Pop', 'Metal', 'Jazz', 'Hip-hop', 'Bluegrass', 'Country', 'Electronic', 'Classical']

# 9 thể loại + 1 NaN = 10 mục, mỗi mục chiếm ~0.1 (hoặc bạn có thể điều chỉnh)
probabilities = [0.11]*9 + [0.01]  # Tổng là 0.99 + 0.01 = 1.0

genre_col = np.random.choice(genres + [np.nan], size=50, p=probabilities)

followers_col = np.random.randint(500_000, 5_000_000, size=50).astype(float)
nan_indices = np.random.choice(range(50), size=5, replace=False)
followers_col[nan_indices] = np.nan

sold_out_col = np.random.choice([0, 1], size=50)

df = pd.DataFrame({
    'Genre': genre_col,
    'Social_media_followers': followers_col,
    'Sold_out': sold_out_col
})

In [29]:
df

Unnamed: 0,Genre,Social_media_followers,Sold_out
0,Jazz,4084702.0,1
1,Classical,2595505.0,0
2,Country,2799435.0,0
3,Bluegrass,696769.0,0
4,Pop,3158505.0,0
5,Pop,1771741.0,1
6,Rock,4269315.0,1
7,Electronic,,0
8,Bluegrass,3547262.0,0
9,Country,2084398.0,1


In [30]:
X = df.iloc[:, 0:2]

In [31]:
y = df.iloc[:, 2]

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=11, test_size=0.3)

In [51]:
num_cols = ['Social_media_followers']

In [53]:
cat_cols = ['Genre']

In [54]:
from sklearn.pipeline import Pipeline

In [55]:
from sklearn.preprocessing import StandardScaler

In [56]:
num_pipeline = Pipeline(steps = [
                        ('impute', SimpleImputer(strategy='mean')),
                        ('scale', StandardScaler())
]
)

In [57]:
from sklearn.preprocessing import OneHotEncoder

In [58]:
cat_pipeline = Pipeline( steps = [
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('one-hot-encoder', OneHotEncoder(handle_unknown='ignore', sparse_output = False))
]
    
                       )

In [59]:
from sklearn.compose import ColumnTransformer

In [60]:
col_transformer = ColumnTransformer( transformers = [
    ('num_pipeline', num_pipeline, num_cols),
    ('cat_pipeline', cat_pipeline, cat_cols),
], 
    remainder = 'drop',
    n_jobs = -1
)

In [61]:
from sklearn.tree import DecisionTreeClassifier

In [62]:
dtc = DecisionTreeClassifier()

In [63]:
pipefinal = make_pipeline(col_transformer, dtc)

In [64]:
pipefinal.fit(X_train, y_train)

In [65]:
 pipefinal.score(X_test, y_test)

0.4

In [66]:
#how to save pipeline
import joblib

In [69]:
joblib.dumb(pipefinal, "pipe.joblib")

AttributeError: module 'joblib' has no attribute 'dumb'