## Pandas


In [None]:
import pandas as pd

# The statements below are independent reference snippets. The columns they
# use (salary, left, Age, Category, experience) are assumed to exist in
# whichever frame is loaded — TODO confirm against the real data.

# Build a frame from a feature matrix, one column per feature name.
# NOTE(review): `iris` is not defined in this cell — presumably the result of
# sklearn.datasets.load_iris(); confirm.
df = pd.DataFrame(iris.data, columns=iris.feature_names)
# Read a CSV, skipping its first line and supplying explicit column names.
# This rebinds `df`, replacing the frame built on the previous line.
df = pd.read_csv("income.csv", index_col=None, names=["income", "count"], skiprows=1)
# Write the frame to disk without the index column.
df.to_csv("output.csv", index=False)

# Keep only the numeric-dtype columns.
numeric_columns = df.select_dtypes(include="number")


# Create new column
df["target"] = iris.target

# Cross-tabulate two columns and draw the counts as a bar chart.
pd.crosstab(df.salary, df.left).plot(kind="bar")


# Group-wise aggregates: mean per "Age" value, summary statistics per "Category" value.
inputs.groupby("Age").mean()
df.groupby("Category").describe()

from word2number import w2n

# Pass every "experience" value through w2n.word_to_num and store the result back.
df.experience = df["experience"].apply(w2n.word_to_num)

#### Data Exploration


In [None]:
# Quick-look helpers for a DataFrame. A notebook cell renders only its last
# expression, so run these one at a time (or wrap them in display()).
df
df.sample(5)  # 5 randomly chosen rows
df.head(5)  # first 5 rows
df[df.column == 1].head()  # first rows where `column` equals 1
df[45:55]  # slice of rows 45:55
df.loc[30]  # row whose index label is 30
df.shape  # (number of rows, number of columns)
df.columns  # column labels
df.nunique()  # count of distinct values per column
df.describe()  # summary statistics for the frame
df["column"].describe()  # summary statistics for one column
df["column"].unique()  # distinct values of one column
df["column"].value_counts(ascending=False)  # frequency of each value, most frequent first
df["column"].values  # the column's data as an array
df1 = df.copy()  # separate copy, so later edits to df1 leave df untouched

#### Features selection


In [None]:
# Drop one column and rebind the result to `df`.
df = df.drop(columns=["column"])
# Drop several columns, modifying the existing frame.
df.drop(columns=["column1", "column2"], inplace=True)

#### Feature Engineering


#### 1) Handle na values


In [None]:
# Count missing values per column.
df.isna().sum()
# Option 1: drop every row that contains at least one missing value.
df.dropna(inplace=True)
# Option 2: keep only the rows where one specific column is present.
df = df[df.column.notnull()]
# Option 3: replace missing values with 0 in a chosen list of columns.
df[cols_to_fill_zero] = df[cols_to_fill_zero].fillna(0)
# Option 4: fill with the column mean. Assign the result back instead of calling
# fillna(..., inplace=True) on df["column"]: that form is chained assignment,
# which warns in pandas 2.x and leaves df unchanged under Copy-on-Write.
df["column"] = df["column"].fillna(df["column"].mean())
# Option 5: fill with the column median.
df.column = df.column.fillna(df.column.median())

#### 2) Handle categorical values


In [None]:
# Dimensionality reduction: fold the rarely seen locations into a single
# "other" bucket so the column has fewer distinct categories.
df5["location"] = df5["location"].apply(
    lambda loc: loc if loc not in location_stats_less_than_10 else "other"
)

# Dummy variables for one column: build integer indicator columns, then
# append them to the original frame side by side.
dummies = pd.get_dummies(df["column"]).astype("int")
merged = pd.concat([df, dummies], axis=1)

# Dummy variables for the whole frame, dropping the first level of each
# encoded column.
df = pd.get_dummies(df, drop_first=True, dtype=int)

#### 3) Handle text values


In [None]:
# 4 Bedroom	-> 4
# Take the text before the first space of each "size" value and convert it to int.
df3["bhk"] = df3["size"].apply(lambda x: int(x.split(" ")[0]))
# Inspect the distinct values produced.
df3.bhk.unique()

# spam -> 1
# Binary flag: 1 when Category is exactly "spam", otherwise 0.
df["spam"] = df["Category"].apply(lambda x: 1 if x == "spam" else 0)


# explore other than number formats in the feature
def is_float(x):
    """Report whether ``float(x)`` succeeds.

    Args:
        x: Any value, typically a cell taken from a DataFrame column.

    Returns:
        True when ``float(x)`` converts without error, False when the
        conversion fails with TypeError, ValueError or OverflowError.

    Only conversion failures are treated as "not a float"; unrelated
    exceptions such as KeyboardInterrupt are allowed to propagate instead of
    being silently turned into False.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        return False
    return True


# Show the rows whose total_sqft value makes is_float return False.
df3[~df3["total_sqft"].apply(is_float)]


# 2100 - 2850  ->  2475
def convert_sqft_to_num(x):
    """Convert a square-footage value to a float.

    Args:
        x: Either a plain number ("1200"), a range written as "low - high"
            ("2100 - 2850"), or any other value.

    Returns:
        The midpoint for a two-part numeric range, ``float(x)`` for anything
        else that converts, and None when no conversion is possible.

    A string with exactly one "-" is tried as a range first. If either side is
    not numeric (e.g. "abc - def", "-5", "1e-3") the value falls back to a
    plain float conversion instead of raising. Non-string inputs such as None
    skip the range step entirely.
    """
    if isinstance(x, str):
        parts = x.split("-")
        if len(parts) == 2:
            try:
                return (float(parts[0]) + float(parts[1])) / 2
            except ValueError:
                # Not a numeric range; fall through to the plain conversion.
                pass
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        return None

#### 4) Handle outliers


Normal Distribution - the data points form a bell curve in a histogram plot

Mean & Standard Deviation - Reference points on a normally distributed curve

ZScore - A value (computable by hand or in Python) that tells how many standard deviations (sd) a data point is away from the mean

(Math formula -> $z = (x - \mu) / \sigma$, i.e. (data point - mean) / sd  
py method -> `(df['column'] - df['column'].mean()) / df['column'].std()`

For example, suppose the mean is 66.37 and the standard deviation is 3.84.
If a data point has the value 77.91, its Z score is about 3 because it lies roughly 3 standard deviations above the mean (77.91 ≈ 66.37 + 3 \* 3.84)
)


In [None]:
# Summary statistics for one column.
# NOTE(review): `dff` is not defined anywhere in this section — possibly a typo for `df`; confirm.
dff['column'].describe()
# Difference in row count between df and df2.
# assumes df2 is a filtered copy of df, making this the number of rows removed — TODO confirm
df.shape[0] - df2.shape[0]


#plot histogram to see data distribution

import matplotlib.pyplot as plt
from scipy.stats import norm
import numpy as np
plt.xlabel("Price per square ft")
plt.ylabel("Count")
# density=True normalises the bars so the histogram shares a scale with the PDF drawn below.
plt.hist(df4.price_per_sqft, bins=20, rwidth=0.8, density=True)
# x positions for the curve: from -5000 up to the column maximum, in steps of 100.
rng = np.arange(-5000, df4.price_per_sqft.max(), 100)
# Overlay the normal PDF that has the column's own mean and standard deviation.
plt.plot(rng, norm.pdf(rng, df4.price_per_sqft.mean(), df4.price_per_sqft.std()))

import seaborn as sn
# Alternative: seaborn histogram with a kernel density estimate (kde=True).
sn.histplot(df.price_per_sqft, kde=True)

# Percentile
# Cut-offs at the 0.1% and 99.9% quantiles (use [0.01, 0.99] for 1% and 99%).
min_thresold, max_thresold = df.price.quantile([0.001,0.999]) #0.1% and 99.9% / [0.01,0.99] #1% and 99%
# Outliers in the lower tail.
df[df.price<min_thresold]
# Outliers in the upper tail: rows ABOVE the max threshold. The previous
# `<max_thresold` comparison selected nearly every row instead.
df[df.price>max_thresold]
# Keep only the rows strictly between the two thresholds.
df2 = df[(df.price>min_thresold)&(df.price<max_thresold)]

#Standard Deviation
# Limits placed 3 standard deviations either side of the column mean.
upper_limit = df['column'].mean() + 3*df['column'].std()
lower_limit = df['column'].mean() - 3*df['column'].std()
# Rows outside the limits (the outliers).
df[(df['column']>upper_limit) | (df['column']<lower_limit)]
# Rows strictly inside the limits.
df_no_outlier_std_dev = df[(df['column']>lower_limit) & (df['column']<upper_limit)]

#ZScore
# z = (value - column mean) / column standard deviation, stored as a new column.
df['zscore'] = ( df['column'] - df['column'].mean() ) / df['column'].std()
# Rows more than 3 standard deviations above the mean.
df[df['zscore']>3]
# Rows more than 3 standard deviations below the mean.
df[df['zscore']<-3]
# Both tails together.
df[(df.zscore<-3) | (df.zscore>3)]
# Rows whose z-score lies strictly between -3 and 3.
df_no_outliers = df[(df.zscore>-3) & (df.zscore<3)]

#IQR
# First and third quartiles of the height column.
Q1 = df.height.quantile(0.25)
Q3 = df.height.quantile(0.75)
# Inter-quartile range: the spread of the middle half of the data.
IQR = Q3 - Q1
# Limits placed 1.5 * IQR beyond each quartile.
# These rebind lower_limit / upper_limit if the standard-deviation snippet ran earlier.
lower_limit = Q1 - 1.5*IQR
upper_limit = Q3 + 1.5*IQR
# Rows outside the limits (the outliers).
df[(df.height<lower_limit)|(df.height>upper_limit)]
# Rows strictly inside the limits.
df_no_outlier = df[(df.height>lower_limit)&(df.height<upper_limit)]

## Numpy


In [None]:
import numpy as np

# One-dimensional integer array holding 1 through 5.
x = np.arange(1, 6)

## Matplotlib


In [None]:
import matplotlib.pyplot as plt

# Default size for every figure created afterwards. Accessed through
# plt.rcParams because only `matplotlib.pyplot` is imported here; the bare
# name `matplotlib` is not bound and would raise NameError.
plt.rcParams["figure.figsize"] = (20, 10)

# Independent pyplot calls for reference; each acts on the current figure/axes.
plt.title("Flowers")
plt.xlabel("Sepal Length")
plt.ylabel("Sepal Width")
plt.gray()  # switch the default colormap to gray
plt.figure(figsize=(10, 7))  # start a new figure with an explicit size
plt.legend()  # show labels of artists that were given label=...

# linewidth is passed as a number (it was the string "5").
plt.scatter(
    df0["x"], df0["y"], color="green", marker="+", label="centroid", linewidth=5
)
# Render a 2-D array as an image.
plt.matshow(digits.images[i])
# Line plot of sse against k_rng.
plt.plot(k_rng, sse, color="green")

## Seaborn


In [None]:
# Import under the alias the calls below actually use; the previous alias
# `sn` left every `sns.` call failing with NameError.
import seaborn as sns

# Annotated heatmap of a matrix; fmt="d" prints the cell values as integers.
sns.heatmap(conf_matrix, annot=True, cmap="Blues", fmt="d")

# log normal distribution
sns.set(rc={"figure.figsize": (11.7, 8.27)})
g = sns.barplot(x="income", y="count", data=df)
# Rotate the tick labels and right-align them.
g.set_xticklabels(g.get_xticklabels(), rotation=45, horizontalalignment="right")
# Logarithmic x axis.
g.set(xscale="log")

## Scikit-Learn


In [None]:
# Estimators
from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.naive_bayes import GaussianNB, MultinomialNB

# Datasets, model selection, metrics and preprocessing.
# The model_selection import used to end with a trailing comma, which is a
# SyntaxError in an unparenthesised `from ... import` statement.
from sklearn.datasets import load_digits, load_iris, load_wine
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer

In [None]:
## Model

from sklearn.linear_model import LinearRegression

# Fit a linear regression, predict for one single-feature sample, then inspect it.
reg = LinearRegression()
reg.fit(new_df, price)
reg.predict([[3300]])
reg.coef_  # fitted coefficient(s)
reg.intercept_  # fitted intercept
# Score the estimator that was just fitted, on its own training data. The
# previous line called `model.score(X, y)`, but no `model` exists at this
# point in the cell.
reg.score(new_df, price)

# One snippet per estimator. `model` is rebound several times, so after the
# cell runs it refers to whichever estimator was assigned last.
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier()
model.fit(inputs_n, target)
# Scored on the same data the tree was fitted on.
model.score(inputs_n, target)
# Predict for one sample with three feature values.
model.predict([[2, 1, 0]])


from sklearn.svm import SVC

# Support vector classifier with explicit C, gamma and kernel.
model_C = SVC(C=1, gamma=10, kernel="linear")
model_C.fit(X_train, y_train)
model_C.score(X_test, y_test)

from sklearn.ensemble import RandomForestClassifier

# Random forest built from 20 trees.
model = RandomForestClassifier(n_estimators=20)
model.fit(X_train, y_train)
model.score(X_test, y_test)

from sklearn.cluster import KMeans

# Cluster into 3 groups; fit_predict returns the cluster label of each row.
model = KMeans(n_clusters=3)
y_predicted = model.fit_predict(df)

from sklearn.naive_bayes import GaussianNB, MultinomialNB

model = GaussianNB()

from sklearn.linear_model import Lasso

# Lasso regression, constructed and fitted in one expression.
lasso_reg = Lasso(alpha=50, max_iter=100, tol=0.1).fit(train_X, train_y)

from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier using 10 neighbours.
knn = KNeighborsClassifier(n_neighbors=10).fit(X_train, y_train)

In [None]:
##Preprocessing

# Rescale columns "x" and "y" with MinMaxScaler and write the result back.
scaler = MinMaxScaler()
df[["x", "y"]] = scaler.fit_transform(df[["x", "y"]])

# Replace the town values with the integer codes produced by LabelEncoder.
le = LabelEncoder()
dfle.town = le.fit_transform(dfle.town)

# One-hot encode column 0 of X; remainder="passthrough" keeps the other columns.
ct = ColumnTransformer([("town", OneHotEncoder(), [0])], remainder="passthrough")
X = ct.fit_transform(X)

# Hold out 30% of the samples for testing; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=10
)


from sklearn.model_selection import cross_val_score

# Mean 10-fold cross-validation score for several classifiers on the same data.
# The feature matrix is `X`, matching the rest of this cell; the lowercase `x`
# used before is not the feature matrix.
# NOTE(review): the `multi_class` argument is deprecated in recent
# scikit-learn releases — confirm against the installed version.
print(
    np.average(
        cross_val_score(
            LogisticRegression(solver="liblinear", multi_class="ovr"), X, y, cv=10
        )
    )
)
print(np.average(cross_val_score(SVC(gamma="auto"), X, y, cv=10)))
print(np.average(cross_val_score(DecisionTreeClassifier(), X, y, cv=10)))
print(np.average(cross_val_score(RandomForestClassifier(n_estimators=40), X, y, cv=10)))


from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training texts and return their count matrix.
# assumes X_train is a pandas Series of raw text (it is accessed through .values) — TODO confirm
v = CountVectorizer()
X_train_count = v.fit_transform(X_train.values)

In [None]:
# Visualizations

# Confusion Matrix
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_predicted)  # (truth,prediction)
import matplotlib.pyplot as plt
import seaborn as sn

plt.figure(figsize=(4, 3))
# Draw the heatmap before labelling the axes: heatmap sets the axis labels
# itself (to empty strings for plain array input), so labels applied earlier
# would be overwritten.
sn.heatmap(cm, annot=True)
plt.xlabel("Predicted")
plt.ylabel("Truth")

In [None]:
##Other Functions

from sklearn.pipeline import Pipeline

# Chain the vectorizer and the classifier so both are fitted and applied in one call.
# presumably X_train / emails hold raw text here — verify against the caller
clf = Pipeline([("vectorizer", CountVectorizer()), ("nb", MultinomialNB())])
clf.fit(X_train, y_train)
clf.score(X_test, y_test)
clf.predict(emails)


import joblib

# Persist a fitted model to disk and load it back.
# NOTE(review): joblib files are pickle-based; only load files from a trusted source.
joblib.dump(model, "model_joblib")
mj = joblib.load("model_joblib")

In [None]:
# Builtin Datasets
# Attributes read from a loaded dataset object.
# presumably `iris` / `digits` come from sklearn.datasets.load_iris() / load_digits() — confirm
iris.feature_names
iris.target_names
iris.data
iris.target
# List every attribute available on the digits object.
dir(digits)

## Choose Best Model


### Different Models with Different Parameters


In [None]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import RandomizedSearchCV


# Candidate estimators, each paired with the hyper-parameter values to try.
model_params = {
    "svm": {
        "model": svm.SVC(gamma="auto"),
        "params": {"C": [1, 10, 20], "kernel": ["rbf", "linear"]},
    },
    "random_forest": {
        "model": RandomForestClassifier(),
        "params": {"n_estimators": [1, 5, 10]},
    },
    "logistic_regression": {
        "model": LogisticRegression(solver="liblinear", multi_class="auto"),
        "params": {"C": [1, 5, 10]},
    },
}

# One result record per estimator.
scores = []

# Run a 5-fold randomized search for every candidate and keep its best result.
# NOTE(review): no random_state is passed, so runs are not seeded.
# This loop rebinds `clf` on every iteration.
for model_name, mp in model_params.items():
    clf = RandomizedSearchCV(mp["model"], mp["params"], cv=5, return_train_score=False)
    clf.fit(iris.data, iris.target)
    scores.append(
        {
            "model": model_name,
            "best_score": clf.best_score_,
            "best_params": clf.best_params_,
        }
    )

# Tabulate the results. This rebinds `df`, replacing any frame held under that name.
df = pd.DataFrame(scores, columns=["model", "best_score", "best_params"])
df

### 1 Model with Different Parameters


In [None]:
from sklearn.model_selection import GridSearchCV

# Evaluate every C / kernel combination for a single SVC with 5-fold
# cross-validation, then show the mean test score of each combination.
rs = GridSearchCV(
    estimator=svm.SVC(gamma="auto"),
    param_grid={"C": [1, 10, 20], "kernel": ["rbf", "linear"]},
    cv=5,
    return_train_score=False,
)
rs.fit(iris.data, iris.target)
pd.DataFrame(rs.cv_results_).loc[:, ["param_C", "param_kernel", "mean_test_score"]]