# Scikit-learn

## 1. Importing scikit-learn:

In [1]:
import sklearn

## 2. Loading Datasets:

In [2]:
from sklearn.datasets import load_iris

# Load the bundled Iris dataset, then pull out the feature matrix and the labels.
iris = load_iris()
X = iris.data
y = iris.target


## 3. Train-Test Split:

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


## 4. Creating a Model - Example: Decision Tree:

In [4]:
from sklearn.tree import DecisionTreeClassifier

# Seed the tree: scikit-learn permutes features at every split, so without a
# fixed random_state equally good splits can be resolved differently on each
# run, which would make the predictions and metrics below non-reproducible.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)


## 5. Making Predictions:

In [5]:
# Predict a class label for every sample in the held-out test split.
y_pred = clf.predict(X_test)


## 6. Evaluating Model Performance - Classification:

In [6]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Score the held-out predictions three ways: per-class error counts,
# a per-class precision/recall/F1 text report, and overall accuracy.
confusion_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)


## 7. Evaluating Model Performance - Regression:

In [7]:
from sklearn.datasets import load_diabetes
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

# MSE and R^2 are regression metrics, so they are demonstrated on a continuous
# target (the bundled diabetes dataset) instead of the iris class labels.
# Separate *_reg names keep the classification split above untouched.
X_reg, y_reg = load_diabetes(return_X_y=True)
X_reg_train, X_reg_test, y_reg_train, y_reg_test = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)

reg = DecisionTreeRegressor(random_state=42)
reg.fit(X_reg_train, y_reg_train)
y_reg_pred = reg.predict(X_reg_test)

mse = mean_squared_error(y_reg_test, y_reg_pred)
r2 = r2_score(y_reg_test, y_reg_pred)


## 8. Scaling Features - StandardScaler:
Standardize features by removing the mean and scaling to unit variance.

In [8]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean and variance from the training split only, then
# apply that same transformation to both the training and the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


## 9. Pipeline Construction:
Build a pipeline for preprocessing and modeling.

In [10]:
from sklearn.pipeline import Pipeline

# Chain scaling and the decision tree so that a single fit() call trains both
# steps on the training data.
pipeline = Pipeline(
    steps=[('scaler', StandardScaler()), ('clf', DecisionTreeClassifier())]
)
pipeline.fit(X_train, y_train)


## 10. Hyperparameter Tuning - GridSearchCV:
Search for the best hyperparameters using cross-validation.

In [12]:
from sklearn.model_selection import GridSearchCV

# Try each candidate tree depth with 5-fold cross-validation on the training
# split and keep the best-scoring setting.
param_grid = {'max_depth': [3, 5, 7]}
grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=param_grid,
    cv=5,
)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_


## 11. Cross-Validation:
Perform cross-validation to assess model performance.

In [13]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation over the full dataset; one score per fold,
# then averaged into a single summary number.
scores = cross_val_score(estimator=clf, X=X, y=y, cv=5)
mean_accuracy = scores.mean()


## 12. Feature Selection - SelectKBest:
Select top k features based on statistical tests.

In [14]:
from sklearn.feature_selection import SelectKBest, f_classif

# Rank features with the ANOVA F-test on the training split and keep the best two.
selector = SelectKBest(score_func=f_classif, k=2)
selector.fit(X_train, y_train)
X_train_selected = selector.transform(X_train)


## 13. Feature Extraction - PCA (Principal Component Analysis):
Reduce dimensionality using PCA.

In [15]:
from sklearn.decomposition import PCA

# Project the feature matrix onto two principal components.
# Note: the projection is fitted on the full X, not only on the training split.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)


## 14. K-Means Clustering:
Perform k-means clustering.

In [16]:
from sklearn.cluster import KMeans

# n_init is set explicitly because its default changed from 10 to 'auto' in
# scikit-learn 1.4 (1.2-1.3 emit a FutureWarning when it is omitted).
# random_state fixes the centroid initialisation so the cluster labels are
# the same on every run.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
cluster_labels = kmeans.fit_predict(X)




## 15. Model Persistence - Saving and Loading:
Save and load trained models.

In [17]:
# sklearn.externals.joblib was removed in scikit-learn 0.23; use the standalone
# joblib package, which is installed as a scikit-learn dependency.
import joblib

# Serialise the fitted classifier to disk and read it back.
joblib.dump(clf, 'model.pkl')
# NOTE: joblib.load unpickles arbitrary objects - only load files you trust.
loaded_model = joblib.load('model.pkl')

## 16. Ensemble Methods - Random Forest:
Use an ensemble method like Random Forest.

In [18]:
from sklearn.ensemble import RandomForestClassifier

# A forest draws bootstrap samples and random feature subsets for every tree;
# a fixed random_state makes the fitted ensemble reproducible across runs.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)


## 17. Support Vector Machines (SVM):
Train a Support Vector Machine model.

In [19]:
from sklearn.svm import SVC

# Linear-kernel support vector classifier trained on the training split.
svm = SVC(kernel='linear')
svm.fit(X=X_train, y=y_train)


## 18. Grid Search with Pipeline:
Perform hyperparameter tuning using GridSearchCV with a pipeline.

In [20]:
from sklearn.model_selection import GridSearchCV

# Parameters of a pipeline step are addressed as '<step name>__<parameter>',
# so the 'clf__' prefix targets the decision tree inside the pipeline.
param_grid = {'clf__max_depth': [3, 5, 7], 'clf__min_samples_split': [2, 5, 10]}
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
)
grid_search.fit(X_train, y_train)


## 19. Text Processing - CountVectorizer and TfidfVectorizer:
Convert text data into numerical features.

In [21]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# A small in-memory corpus so the cell runs on a fresh kernel
# (text_data was previously undefined and raised NameError).
text_data = [
    "the quick brown fox jumps over the lazy dog",
    "the lazy dog sleeps all day",
    "a quick brown fox is a clever animal",
]

# Bag-of-words: one row per document, one column per vocabulary term (raw counts).
vectorizer = CountVectorizer()
X_text = vectorizer.fit_transform(text_data)

# TF-IDF: same layout, but counts are re-weighted by inverse document frequency.
tfidf_vectorizer = TfidfVectorizer()
X_text_tfidf = tfidf_vectorizer.fit_transform(text_data)

## 20. Time Series - TimeSeriesSplit:
Split time series data for cross-validation.

In [22]:
from sklearn.model_selection import TimeSeriesSplit

# Expanding-window splits: each fold trains on an initial segment of the rows
# and tests on the rows that immediately follow it.
tscv = TimeSeriesSplit(n_splits=5)
for train_index, test_index in tscv.split(X):
    # Fold-local names keep the notebook-wide X_train/X_test/y_train/y_test
    # from being overwritten (previously X_train/X_test were clobbered here
    # and no longer matched y_train/y_test).
    X_train_fold, X_test_fold = X[train_index], X[test_index]
    y_train_fold, y_test_fold = y[train_index], y[test_index]


# END