In [2]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
%matplotlib inline 

In [3]:
# Read the training set from the working directory and preview its first five rows
df = pd.read_csv(filepath_or_buffer="train.csv")
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Gender,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [19]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import accuracy_score, classification_report
import pickle
import joblib
# Read the raw training table, then separate the label series from the feature frame
data = pd.read_csv('train.csv')
y = data['Survived']
X = data.drop(columns='Survived')

# Column groups consumed by the preprocessing steps defined further down in this cell.
# NOTE(review): the train.csv preview captured earlier in this notebook lists a 'Gender'
# column rather than 'Sex' -- confirm which name the file on disk actually uses.
numeric_features = ['Age', 'Fare', 'SibSp', 'Parch']
ordinal_features = ['Pclass']
nominal_features = ['Sex', 'Embarked']
# Every non-numeric column: the ordinal group followed by the nominal group
categorical_features = ordinal_features + nominal_features

# Custom transformer for feature construction
class FeatureConstructor(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends engineered columns to a passenger frame.

    ``transform`` reads the ``Name``, ``SibSp``, ``Parch``, ``Fare``, ``Age`` and
    ``Pclass`` columns and adds ``Title``, ``FamilySize``, ``IsAlone``,
    ``FarePerPerson`` and ``Age*Class``. The input frame is copied, never modified.
    """

    def fit(self, X, y=None):
        """Learn nothing from the data; return ``self`` so the step can sit in a Pipeline."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the engineered columns added.

        Args:
            X: DataFrame holding the columns listed in the class docstring.

        Returns:
            A new DataFrame with the original columns plus the five engineered ones.
        """
        X = X.copy()
        # Title: the first run of letters that follows a space and ends with a period.
        # The pattern is a raw string so ``\.`` reaches the regex engine as an escaped
        # dot instead of being an invalid Python string escape sequence.
        X['Title'] = X['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
        # Collapse the listed titles into a single 'Rare' bucket
        rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
        X['Title'] = X['Title'].replace(rare_titles, 'Rare')
        # Family size: SibSp + Parch, plus one
        X['FamilySize'] = X['SibSp'] + X['Parch'] + 1
        # Is alone: 1 when the family size is exactly one, otherwise 0
        X['IsAlone'] = (X['FamilySize'] == 1).astype(int)
        # Fare per person: the fare divided by the family size
        X['FarePerPerson'] = X['Fare'] / X['FamilySize']
        # Age * Class interaction term
        X['Age*Class'] = X['Age'] * X['Pclass']
        return X

# Numeric pipeline: fill gaps with the column median, then standardise
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical pipeline: label gaps as 'missing', then one-hot encode.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros instead of raising.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Ordinal pipeline: fill gaps with the most frequent value, then map categories to integer codes
ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder())
])

# Preprocessor: route each column group through its own pipeline.
# Columns not named in any group are dropped (ColumnTransformer's default remainder).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, nominal_features),
        ('ord', ordinal_transformer, ordinal_features)
    ])

# Feature engineering pipeline
feature_engineering = Pipeline([
    # ('constructor', FeatureConstructor()),
    ('preprocessor', preprocessor),
    # The preprocessor only emits the scaled numerics, the one-hot columns of the two
    # nominal features and one ordinal column, which is fewer than the 15 features
    # previously requested here. Asking for more features than exist makes SelectKBest
    # warn or raise depending on the scikit-learn version, so keep every column explicitly.
    # TODO: switch back to an integer k once the preprocessor produces more columns.
    ('selector', SelectKBest(f_classif, k='all'))
])

# Base learners, each seeded with the same random_state
rf_classifier = RandomForestClassifier(random_state=42)
lr_classifier = LogisticRegression(random_state=42)
# probability=True enables predict_proba on the SVC, which soft voting relies on
svm_classifier = SVC(random_state=42, probability=True)
gb_classifier = GradientBoostingClassifier(random_state=42)

# Ensemble - soft-voting classifier over the four base learners
voting_members = [
    ('rf', rf_classifier),
    ('lr', lr_classifier),
    ('svm', svm_classifier),
    ('gb', gb_classifier),
]
ensemble_classifier = VotingClassifier(estimators=voting_members, voting='soft')

# Final pipeline: feature preparation followed by the ensemble model
final_pipeline = Pipeline(steps=[
    ('features', feature_engineering),
    ('classifier', ensemble_classifier),
])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the preprocessing steps and the ensemble on the training portion
print("\nTraining Ensemble Model...")
final_pipeline.fit(X_train, y_train)

# Score the held-out rows and report accuracy plus per-class metrics
y_pred = final_pipeline.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
print("\nEnsemble Model Performance on Test Set:")
print(f"Accuracy: {test_accuracy:.4f}")
print("Classification Report:")
print(test_report)

# Persist the fitted pipeline to model3.pkl
with open("model3.pkl", "wb") as model_file:
    joblib.dump(final_pipeline, model_file)



Training Ensemble Model...





Ensemble Model Performance on Test Set:
Accuracy: 0.8101
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.88      0.84       105
           1       0.80      0.72      0.76        74

    accuracy                           0.81       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179



In [20]:
import joblib
from processing import FeatureConstructor


# Restore the fitted pipeline that was saved as model3.pkl.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open("model3.pkl", mode="rb") as model_file:
    model = joblib.load(model_file)

# Show the structure of the restored pipeline
print(model)

Pipeline(steps=[('features',
                 Pipeline(steps=[('preprocessor',
                                  ColumnTransformer(transformers=[('num',
                                                                   Pipeline(steps=[('imputer',
                                                                                    SimpleImputer(strategy='median')),
                                                                                   ('scaler',
                                                                                    StandardScaler())]),
                                                                   ['Age',
                                                                    'Fare',
                                                                    'SibSp',
                                                                    'Parch']),
                                                                  ('cat',
                                                     

In [18]:
# Restore the pipeline stored in model2.pkl. Its printed structure includes a
# FeatureConstructor step, so that class must be importable when unpickling.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open("model2.pkl", mode="rb") as model_file:
    model = joblib.load(model_file)

# Show the structure of the restored pipeline
print(model)

Pipeline(steps=[('features',
                 Pipeline(steps=[('constructor', FeatureConstructor()),
                                 ('preprocessor',
                                  ColumnTransformer(transformers=[('num',
                                                                   Pipeline(steps=[('imputer',
                                                                                    SimpleImputer(strategy='median')),
                                                                                   ('scaler',
                                                                                    StandardScaler())]),
                                                                   ['Age',
                                                                    'Fare',
                                                                    'SibSp',
                                                                    'Parch']),
                                                       

In [22]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import pickle

# Read the Titanic training table
df = pd.read_csv('train.csv')

# Split off the target series, then keep every other column as features
y = df['Survived']
X = df.drop(columns='Survived')

# Column groups, one per preprocessing pipeline: numerical, categorical and discrete.
# NOTE(review): the train.csv preview captured earlier in this notebook lists a 'Gender'
# column rather than 'Sex' -- confirm which name the file on disk actually uses.
num_cols = ['Age', 'Fare']
cat_cols = ['Pclass', 'Sex', 'Embarked']
disc_cols = ['SibSp', 'Parch']

# Create preprocessing pipelines

# Numerical columns: fill gaps with the median, then standardise
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill gaps with the most frequent value, then one-hot encode.
# handle_unknown='ignore' makes a category that was absent from the training rows
# encode as all zeros at predict time instead of raising, matching the encoder
# configuration used by the other pipeline in this notebook.
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Discrete count columns: only fill gaps, values are passed through unscaled
disc_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent'))
])

# Combine pipelines into a ColumnTransformer; columns not listed in any group are dropped
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_cols),
        ('cat', cat_pipeline, cat_cols),
        ('disc', disc_pipeline, disc_cols)
    ]
)

# Create an ensemble model: soft voting over three base learners
ensemble = VotingClassifier(
    estimators=[
        ('lr', LogisticRegression()),
        # Seeded because the tree permutes features randomly at each split; without a
        # fixed random_state the fitted model and the reported accuracy can change
        # between runs. 42 matches the seed used for the train/test split.
        ('dt', DecisionTreeClassifier(random_state=42)),
        ('knn', KNeighborsClassifier())
    ],
    voting='soft'
)

# Create the final pipeline: preprocessing followed by the ensemble
model_pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('model', ensemble)
])

# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit preprocessing and the ensemble on the training rows
model_pipeline.fit(X_train, y_train)

# Report the pipeline's score on the held-out rows
test_score = model_pipeline.score(X_test, y_test)
print(f'Test Accuracy: {test_score:.2f}')

# Persist the fitted pipeline as a pickle file
with open('titanic_model.pkl', 'wb') as model_file:
    pickle.dump(model_pipeline, model_file)

Test Accuracy: 0.83


In [26]:
# A generator hands out its values on the fly instead of building them all up front
def my_generator():
    """Lazily yield the integers 0 through 4, one per ``next()`` call.

    Generators are lazy: if the range were huge (say 50 million values), nothing
    would be stored as a list; each value is produced only when it is requested.
    This is also useful when every value needs an expensive computation.
    """
    value = 0
    while value < 5:
        yield value
        value += 1
    
# Pull the first three values one at a time; each next() resumes where the last one stopped
gen = my_generator()
for _pull in range(3):
    print(next(gen))

0


In [25]:
import pandas as pd

def read_csv_in_batches(file_path, batch_size=3):
    """Lazily yield consecutive row batches of a CSV file as DataFrames.

    Args:
        file_path: Location of the CSV file, passed straight to ``pd.read_csv``.
        batch_size: Maximum number of rows per yielded DataFrame (default 3).

    Yields:
        pandas.DataFrame: the next ``batch_size`` rows; the final batch may be shorter.
    """
    # The chunked reader keeps the file open while iterating. Using it as a context
    # manager closes the file once it is exhausted, and also when the consumer stops
    # early and this generator is closed.
    with pd.read_csv(file_path, chunksize=batch_size) as reader:
        for chunk in reader:
            yield chunk

# Example: stream train.csv a few rows at a time
file_path = 'train.csv'
batch_size = 5  # Number of rows per batch; adjust as needed

for batch in read_csv_in_batches(file_path, batch_size=batch_size):
    # Each batch is a DataFrame; printing its index shows which rows it covers
    print(batch.index)
    # Any further per-batch processing would go here


RangeIndex(start=0, stop=5, step=1)
RangeIndex(start=5, stop=10, step=1)
RangeIndex(start=10, stop=15, step=1)
RangeIndex(start=15, stop=20, step=1)
RangeIndex(start=20, stop=25, step=1)
RangeIndex(start=25, stop=30, step=1)
RangeIndex(start=30, stop=35, step=1)
RangeIndex(start=35, stop=40, step=1)
RangeIndex(start=40, stop=45, step=1)
RangeIndex(start=45, stop=50, step=1)
RangeIndex(start=50, stop=55, step=1)
RangeIndex(start=55, stop=60, step=1)
RangeIndex(start=60, stop=65, step=1)
RangeIndex(start=65, stop=70, step=1)
RangeIndex(start=70, stop=75, step=1)
RangeIndex(start=75, stop=80, step=1)
RangeIndex(start=80, stop=85, step=1)
RangeIndex(start=85, stop=90, step=1)
RangeIndex(start=90, stop=95, step=1)
RangeIndex(start=95, stop=100, step=1)
RangeIndex(start=100, stop=105, step=1)
RangeIndex(start=105, stop=110, step=1)
RangeIndex(start=110, stop=115, step=1)
RangeIndex(start=115, stop=120, step=1)
RangeIndex(start=120, stop=125, step=1)
RangeIndex(start=125, stop=130, step=1)
Ra