#### Project: Data Science Template <br> Programmer: Dhruv Singh <br> Date Updated: 10/18/2022

# Data Science Project Template

### Step 1: Combine & Clean Data (10%)

In [None]:
# libraries
import pandas as pd
import numpy as np

In [None]:
# read in csv data ('filename.csv' is a placeholder -- replace with the real path)
df = pd.read_csv('filename.csv')

Basics 1: Tabulation

* Rows / Columns: df.shape
* Data types: df.dtypes
* Missing Data: df.isnull().sum()

Basics 2: Merge / Concat
* Concatenate vertically: df_concat = pd.concat([df1, df2, df3])
* Merge horizontally: set both indices, df_merge = pd.merge(df1, df2, how = 'inner / left / outer', left_index=True, right_index=True)

Notes:
* Can only merge 2 dfs at a time
* Make sure all columns are aligned before concatenating, can drop or insert 'empty' columns to do so

Basics 3: Cleaning Variables
* Strip out special characters from strings: df["column"] = df["column"].str.replace(',', '')
* Convert data to appropriate types (int, float): df["column"] = df["column"].astype('float/int')
* Convert string to datetime: df.column = pd.to_datetime(df.column)
* Recode Categories: df.loc[condition, 'column'] = value
* Impute missing values: df.fillna(value / 0 / df.median(), inplace=True) # uses column medians
* Drop irrelevant columns: df.drop(columns=['A', 'B'], inplace=True)
* Rename Columns: df = df.rename(columns = {"A_old": "A_new", "B_old": "B_new"})

In [None]:
import datetime

Basics 4: Working with datetime variables
1. Splitting date columns into date and time
* df['date'] = pd.to_datetime(df['datetime']).dt.date
* df['day'] = pd.to_datetime(df['datetime']).dt.weekday
* df['time'] = pd.to_datetime(df['datetime']).dt.time

2. Creating month and year columns
* df['year'] = pd.DatetimeIndex(df['date']).year.fillna(0).astype(int)
* df['month'] = pd.DatetimeIndex(df['date']).month.fillna(0).astype(int)

3. Creating year-month variable
* df['year_month'] = df['year'].map(str) + '-' + df['month'].map(str)
* df['year_month'] = pd.to_datetime(df['year_month'], format='%Y-%m').dt.strftime('%Y-%m')

### Step 2: EDA, basic (10%)

* Summarize all numeric variables: df.describe().round(2)
* Groupbys: df_agg = df[['col1', 'col2', 'col3']].groupby(['col1', 'col2']).agg(['count', 'sum'])
* Nested headers: df.columns = df.columns.get_level_values(1)
* Reshape, long to wide: (see below)
* Reshape, wide to long: df_long = pd.melt(df, id_vars=df.columns[0:m], value_vars=df.columns[m:n])

Notes: 
* Rename columns after groupby
* In melt, id vars stay fixed, value columns get stacked next to their column headers

![alt text](reshape.png)

Source: https://towardsdatascience.com/reshape-pandas-dataframe-with-pivot-table-in-python-tutorial-and-visualization-2248c2012a31

### Step 3: Visualize Data || Reshape It (15%)

In [None]:
# libraries
import matplotlib.pyplot as plt
import seaborn as sns

#### 1. Bar graph: 1 category

plt.style.use('ggplot') <br>
x = df.index.tolist()

* ticks <br>

x_pos = [i for i, _ in enumerate(x)] <br>
plt.bar(x_pos, df['Enrollments'], color='green') <br>
plt.xlabel("categorical var") <br>
plt.ylabel("numeric var") <br>
plt.title("graph title")

plt.xticks(x_pos, x)

plt.savefig('directory/name.png', dpi=300, bbox_inches='tight') <br>
plt.show()

#### 2. Bar graph: multiple categories

* setting figure size

plt.figure(figsize=(14, 6))

* number of ticks

N = 3
ind = np.arange(N) 

* width of bars

width = 0.1

* axes and title

plt.title(" ──────── graph name ──────── ", fontsize=20 ,fontweight="bold")
plt.xlabel("category", fontsize=15)
plt.ylabel("numeric var", fontsize=15)

* bar chart data

bar1 = plt.bar(ind, df['col1'], width, color = 'tab:green', edgecolor = "black", linewidth=1)  <br>
bar2 = plt.bar(ind-width, df['col2'], width, color = 'tab:cyan', edgecolor = "black", linewidth=1) <br>
bar3 = plt.bar(ind+width, df['col3'], width, color = 'tab:blue', edgecolor = "black", linewidth=1) <br>
bar4 = plt.bar(ind-width*2, df['col4'], width, color = 'tab:purple', edgecolor = "black", linewidth=1) <br>
bar5 = plt.bar(ind+width*2, df['col5'], width, color = 'tab:red', edgecolor = "black", linewidth=1) <br>
bar6 = plt.bar(ind-width*3, df['col6'], width, color = 'tab:brown', edgecolor = "black", linewidth=1) <br>
bar7 = plt.bar(ind+width*3, df['col7'], width, color = 'tab:orange', edgecolor = "black", linewidth=1) <br>
bar8 = plt.bar(ind-width*4, df['col8'], width, color = 'tab:olive', edgecolor = "black", linewidth=1) <br>
bar9 = plt.bar(ind+width*4, df['col9'], width, color = 'tab:gray', edgecolor = "black", linewidth=1)

* Hide the right and top spines

plt.gca().spines.right.set_visible(False)
plt.gca().spines.top.set_visible(False)

* ticks

plt.xticks(ind, df['categorical var'], fontsize=12)
plt.yticks(fontsize=12)

* legend

plt.legend((bar1, bar2, bar3, bar4, bar5, bar6, bar7, bar8, bar9), 
           ('label1', 'label2', 'label3', 'label4', 'label5', 'label6', 'label7', 'label8', 'label9'), 
           loc='lower center',title_fontsize=10, fontsize=10, bbox_to_anchor=(0.5, -0.25), ncol=5, frameon=False)

* saving chart

plt.savefig('directory/name.png', dpi=300, bbox_inches='tight')
plt.show()

#### 3. Histogram

plt.hist(df['variable']) <br>
plt.title('graph name') <br>
plt.savefig('name.png', dpi=300, bbox_inches='tight') <br>
plt.show()

#### 4. Binning Variables

bins = [lower_lim_inclusive, cutpt1, cutpt2, cutpt3, ... cutptn, upper_lim_inclusive] <br>
labels = ['label1', 'label2', ... 'labeln'] <br>
df['var_binned'] = pd.cut(df['continuous_var'], bins=bins, labels=labels, include_lowest=True)

#### 5. Line Graph

fig, ax = plt.subplots(figsize=(14, 6))

* x axis ticks

xcoords = ['tick1', 'tick2',..., 'tickn']

* axes and title

ax.set_title("───────────── graph name ─────────────", fontsize=20 ,fontweight="bold") <br>
ax.set_xlabel("category", fontsize=15) <br>
ax.set_ylabel("numeric variable", fontsize=15)

* Hide the right and top spines

ax.spines.right.set_visible(False) <br>
ax.spines.top.set_visible(False)

* ticks

ax.tick_params(axis='x', which='major', labelsize=10, rotation = 45) <br>
ax.tick_params(axis='y', which='major', labelsize=10)

* creating plot    

ax.plot(df['datetime_var'], df['numeric var'], label = 'line label', linewidth=3, color='steelblue')

* creating vertical lines

for xc in xcoords:
    ax.axvline(x=pd.to_datetime(xc), color='black', linestyle=':')

* legend

ax.legend(loc='center left',title_fontsize=20, fontsize=20, bbox_to_anchor=(1, 0.5))

* saving out plot

plt.savefig('directory/name.png', dpi=300, bbox_inches='tight') <br>
plt.show()

#### 6. Line Graph: Multiple

* converting string to datetime

df.Date = pd.to_datetime(df.Date)  <br>
fig, ax = plt.subplots(figsize=(14, 6))

* axes and title

ax.set_title(" ──── graph name ──── ", fontsize=20 ,fontweight="bold") <br>
ax.set_xlabel("categorical var", fontsize=15) <br>
ax.set_ylabel("numeric var", fontsize=15)

* Hide the right and top spines

ax.spines.right.set_visible(False) <br>
ax.spines.top.set_visible(False)

* ticks

ax.tick_params(axis='x', which='major', labelsize=12, rotation = 45) <br>
ax.tick_params(axis='y', which='major', labelsize=12)

* creating plot    

ax.plot(df['Date'], df['var1'], label = 'name1', linewidth=2, color='gray') <br>
ax.plot(df['Date'], df2['var2'], label = 'name2', linewidth=2, color='orange') <br>
ax.plot(df['Date'], df3['var3'], label = 'name3', linewidth=2, color='steelblue')

* limiting x-axis to a date range (start date, end date)

ax.set_xlim(pd.to_datetime('yyyy-mm-dd'), pd.to_datetime('yyyy-mm-dd'))

* creating vertical lines, annotating

ax.axvline(x=pd.to_datetime('yyyy-mm-dd'), color='black', linestyle='--') <br>
ax.axvline(x=pd.to_datetime('yyyy-mm-dd'), color='black', linestyle='--') <br>
ax.axvline(x=pd.to_datetime('yyyy-mm-dd'), color='black', linestyle='--')

* legend

ax.legend(loc='lower center',title_fontsize=10, fontsize=10, bbox_to_anchor=(0.5, -0.25), ncol=3, frameon=False)

* saving out plot

plt.savefig('directory/2.name.png', dpi=300, bbox_inches='tight')

#### 7. Scatter + Fitted Line

* setting plot dimensions

fig, ax = plt.subplots(figsize=(14, 6))

* creating plot

ax = sns.regplot(x='numeric var 1', y='numeric var 2', data=df, color='steelblue', marker='o', scatter_kws={'s': 70})

* axes and title

ax.set_title("──────── graph name ────────", fontsize=20 ,fontweight="bold") <br>
ax.set_xlabel("label 1", fontsize=15) <br>
ax.set_ylabel("label 2", fontsize=15)

* Hide the right and top spines

ax.spines.right.set_visible(False) <br>
ax.spines.top.set_visible(False)

* ticks

ax.tick_params(axis='x', which='major', labelsize=10) <br>
ax.tick_params(axis='y', which='major', labelsize=10)

* label limits

ax.set_xlim(0, m) <br>
ax.set_ylim(0, n)

* saving out plot

plt.savefig('directory/name.png', dpi=300, bbox_inches='tight') <br>
plt.show()

#### 8. Box Plots

* setting plot dimensions

fig, ax = plt.subplots(figsize=(20, 10))

* creating plot

ax = sns.boxplot(x="categorical variable", y="numeric variable", data=df)

* axes and title

ax.set_title("───────────────── graph name ─────────────────", fontsize=30,fontweight="bold") <br>
ax.set_xlabel("category", fontsize=25,fontweight="bold") <br>
ax.set_ylabel("numeric var", fontsize=25,fontweight="bold")

* Hide the right and top spines

ax.spines.right.set_visible(False) <br>
ax.spines.top.set_visible(False)

* ticks

ax.tick_params(axis='x', which='major', labelsize=20) <br>
ax.tick_params(axis='y', which='major', labelsize=20)

* saving out plot

plt.savefig('directory/name.png', dpi=300, bbox_inches='tight') <br>
plt.show()

### Step 4: Advanced EDA (15%)

In [None]:
# libraries
# scikit-learn is installed as `scikit-learn` but imported as `sklearn`
import sklearn

* Correlation matrix:

df_num = df.select_dtypes(include='number') <br>
corr = df_num.corr() <br>
corr

* Heatmap:

plt.figure(figsize = (12,12)) <br>
sns.heatmap((corr), annot = True)


* Feature importance regressor / classifier:
* Dimensionality Reduction: 
* Visualize most important features:


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

#### Feature Importance: Random Forest Regressor

* Prepare Data

X = df.drop(columns='target') <br>
y = df['target'].values

* train test split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.40, random_state=0)

* scaling features

scaler = StandardScaler() <br>
X_train = scaler.fit_transform(X_train) <br>
X_test = scaler.transform(X_test)

* ensemble method: RandomForestRegressor

feature_names = X.columns <br>
model = RandomForestRegressor(random_state=1)

* feature importance

model.fit(X_train,y_train) <br>
importances = model.feature_importances_

* saving to a dataframe

X_importances = pd.DataFrame({"Importance":importances, "Indicator":feature_names}).sort_values("Importance", ascending=False)

* cumulative sums

X_importances['CumSum'] = X_importances['Importance'].cumsum(axis=0) <br>
X_importances = X_importances.set_index("Indicator")

* displaying top ten most important features

top_ten=X_importances.nlargest(n=10, columns=['Importance', 'CumSum']) <br>
top_ten

#### Feature Importance: Random Forest Classifier

* Prepare Data

X = df.drop(columns='target') <br>
y = df['target'].values

* train test split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.40, random_state=0)

* scaling features

scaler = StandardScaler()

* Fitting Standard Scaler

X_scaler = scaler.fit(X_train)

* Scaling data

X_train_scaled = X_scaler.transform(X_train) <br>
X_test_scaled = X_scaler.transform(X_test)

* Fitting the model

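from sklearn.ensemble import RandomForestClassifier <br>
model = RandomForestClassifier(random_state=1) <br>
model = model.fit(X_train_scaled, y_train)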

* Making predictions using the testing data

predictions = model.predict(X_test_scaled)

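* Calculating the confusion matrix and the accuracy score

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix <br>
cm_df = pd.DataFrame(confusion_matrix(y_test, predictions)) <br>
acc_score = accuracy_score(y_test, predictions)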

* Displaying results

print("Confusion Matrix") <br>
display(cm_df) <br>
print(f"Accuracy Score : {acc_score}") <br>
print("Classification Report") <br>
print(classification_report(y_test, predictions))

* feature importance

importances = model.feature_importances_

* saving to a dataframe

X_importances = pd.DataFrame({"Importance":importances, "Indicator":feature_names}).sort_values("Importance", ascending=False)

* cumulative sums

X_importances['CumSum'] = X_importances['Importance'].cumsum(axis=0) <br>
X_importances = X_importances.set_index("Indicator")

* displaying top ten most important features

top_ten=X_importances.nlargest(n=10, columns=['Importance', 'CumSum']) <br>
top_ten

In [None]:
from sklearn import decomposition
from sklearn.decomposition import PCA

#### PCA + Linear Regression

X = df.drop(columns=['target']) <br>
y = df[['target']]

from sklearn.preprocessing import StandardScaler <br>
sc = StandardScaler()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

X_train = sc.fit_transform(X_train) <br>
X_test = sc.transform(X_test)

pca = PCA(n_components = 100) <br>
X_train = pca.fit_transform(X_train) <br>
X_test = pca.transform(X_test)

from sklearn.linear_model import LinearRegression <br>
regressor = LinearRegression() <br>
regressor.fit(X_train, y_train)

y_pred = regressor.predict(X_test) <br>
np.set_printoptions(precision=2)

from sklearn.metrics import r2_score <br>
r2_score(y_test, y_pred)

### Step 5: Model Data (50%)

In [None]:
# libraries
# scikit-learn is installed as `scikit-learn` but imported as `sklearn`
import sklearn
# names used by the modelling snippets in this step
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_predict, train_test_split

* train test split
* fit model
* use model on X_test to predict y_test
* score y_pred against y_test to measure model performance / evaluate

#### Logistic Regression
X_train, X_test, y_train, y_test = train_test_split(df.drop('target', axis=1),
                                                   df['target'], test_size=0.2,
                                                   random_state=200)
* fitting model

clf = LogisticRegression(solver='liblinear') <br>
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)


* Model Evaluation: Classification report without cross-validation

print(classification_report(y_test, y_pred))

* K-fold cross-validation & confusion matrices

y_train_pred = cross_val_predict(clf, X_train, y_train, cv=5) <br>
confusion_matrix(y_train, y_train_pred)