<div class="alert alert-info">
    This is a tutorial on using SQL and doing data analysis in Python. Some of the cells below are hidden (<code>...</code>); you should try to google the answer before unhiding them.
</div>

<div class="alert alert-success">
    A Jupyter notebook is made up of various cells, each containing a piece of text or some code that you can run. You can move from one cell to another by using the arrow keys or by clicking a cell with the mouse. In order to execute the code in a cell you have to press <code>Ctrl-Enter</code> while selecting the code cell. Alternatively, you can press the "<i class="fa fa-step-forward"></i> Run" button at the top of the screen. This also moves to the next cell at the same time. Using <code>Shift-Enter</code> instead of <code>Ctrl-Enter</code> will also execute the code and move to the next cell at the same time.

</div>

* a
* b
* c

In [None]:
import pandas as pd
import sqlalchemy as db

In [None]:
# SQLAlchemy URL for the SQL Server connection, using the pyodbc driver.
mssql_url = "mssql+pyodbc://laikh@P-CWTS-010260"

# fast_executemany=True is forwarded to the pyodbc dialect so that bulk
# INSERTs (such as DataFrame.to_sql) are sent in batches.
engine = db.create_engine(mssql_url, fast_executemany=True)

In [None]:
# (Re)load the `sql` IPython extension, which provides the %sql / %%sql magics.
%reload_ext sql
# Connect the magics to the database; later %sql / %%sql cells reuse this connection.
%sql mssql+pyodbc://laikh@P-CWTS-010260

In [None]:
%%sql
-- List the name of every database registered in master.sys.databases.
SELECT name 
FROM master.sys.databases

In [None]:
%%sql
-- Preview five random rows of wos_2013.dbo.pub with pub_year above 2000.
-- TABLESAMPLE reads only an approximate sample of the table (about 1000 rows),
-- the WHERE clause filters that sample, and ORDER BY NEWID() shuffles what is
-- left before TOP (5) keeps the first five rows.
SELECT TOP (5) *
FROM wos_2013.dbo.pub p 
TABLESAMPLE(1000 ROWS)
WHERE pub_year > 2000
ORDER BY NEWID()

In [None]:
# T-SQL text kept as a Python string so it can be handed to pandas.
# It draws an approximate 100000-row sample of the table, keeps rows with
# pub_year above 2000, shuffles them with NEWID() and returns at most 10000.
sql_query = """
SELECT TOP (10000) *
FROM wos_2013.dbo.pub p 
TABLESAMPLE(100000 ROWS)
WHERE pub_year > 2000
ORDER BY NEWID()
"""

In [None]:
# Run the query through the SQLAlchemy engine and load the result set
# into a DataFrame.
df = pd.read_sql(sql=sql_query, con=engine)

In [None]:
# Number of sampled rows per pub_year, drawn as a line plot.
pubs_per_year = df.groupby("pub_year").size()
pubs_per_year.plot()

In [None]:
import seaborn as sns

In [None]:
# n_cits against pub_year, with one line per is_open_access value.
# seaborn aggregates the rows that share an x value before drawing.
sns.lineplot(
    x="pub_year",
    y="n_cits",
    hue="is_open_access",
    data=df,
)

In [None]:
#https://stackoverflow.com/questions/30327153/seaborn-pylab-changing-xticks-from-float-to-int?rq=1
from  matplotlib.ticker import FuncFormatter

In [None]:
def format_year_tick(value, _position):
    """Return the tick value as a whole-number label, e.g. 2005.0 -> "2005".

    FuncFormatter calls this with the tick value and its position; the
    position is not needed here.
    """
    # FuncFormatter callbacks are documented to return a string label,
    # so convert explicitly instead of handing back an int.
    return str(int(value))


# Same plot as before, but the x tick labels are shown as integers
# instead of floats such as 2005.0.
ax = sns.lineplot(data=df, x="pub_year", y="n_cits", hue="is_open_access")
ax.xaxis.set_major_formatter(FuncFormatter(format_year_tick))
ax

In [None]:
#https://stackoverflow.com/questions/30914462/matplotlib-how-to-force-integer-tick-labels
from matplotlib.ticker import MaxNLocator

In [None]:
# Alternative to reformatting the labels: constrain the tick *positions*
# on the x axis to integer values.
integer_ticks = MaxNLocator(integer=True)

ax = sns.lineplot(x="pub_year", y="n_cits", hue="is_open_access", data=df)
ax.xaxis.set_major_locator(integer_ticks)
ax

In [None]:
#dtrain = xgb.DMatrix(df[["pub_year", "n_refs"]], df["n_cits"])

In [None]:
# Approach 1: vectorised arithmetic -- subtract the whole pub_year column
# from 2021 in a single expression.
df["num_year_1"] = 2021 - df["pub_year"]

In [None]:
# Approach 2: Series.map -- the lambda is called once per pub_year value.
df["num_year_2"] = df["pub_year"].map(lambda x: 2021 - x)

In [None]:
# Approach 3: DataFrame.apply with axis=1 -- the lambda is called once per
# row and receives that row, from which it reads pub_year.
df["num_year_3"] = df.apply(lambda x: 2021 - x["pub_year"], axis=1)

In [None]:
def compute_num_year(x):
    """Return 2021 minus the ``pub_year`` entry of the row ``x``."""
    return 2021 - x["pub_year"]


# Approach 4: row-wise apply again, this time with a named function
# in place of an inline lambda.
df["num_year_4"] = df.apply(compute_num_year, axis=1)

In [None]:
# Show the four computed columns side by side for the first few rows.
num_year_columns = ["num_year_1", "num_year_2", "num_year_3", "num_year_4"]
df[num_year_columns].head()

In [None]:
import xgboost as xgb

In [None]:
# Positional 80/20 split: the first 80% of the rows are used for training
# and the remainder for testing.
n_rows = len(df)
train_size = int(n_rows * 0.8)
test_size = n_rows - train_size

df_train = df[:train_size]
# Re-number the held-out rows from zero and tag them so they can be told
# apart from predictions when both are plotted together.
df_test = df[train_size:].reset_index(drop=True).assign(data="test")

In [None]:
# Model 1 uses num_year_1 as its only feature and n_cits as the target.
features_1 = ["num_year_1"]

x_train_1, y_train_1 = df_train[features_1], df_train["n_cits"]
x_test_1, y_test_1 = df_test[features_1], df_test["n_cits"]

In [None]:
# Gradient-boosted tree regressor; tree_method="hist" selects XGBoost's
# histogram-based tree construction algorithm.
reg = xgb.XGBRegressor(tree_method="hist")

In [None]:
# Train model 1 on the single-feature training set.
reg.fit(x_train_1, y_train_1)

In [None]:
# Predict n_cits for the test rows, then build a copy of the test frame in
# which n_cits holds the prediction and the rows are tagged "predicted".
y_pred_1 = reg.predict(x_test_1)
df_pred_1 = df_test.assign(data="predicted", n_cits=y_pred_1)

In [None]:
# Stack observed and predicted rows into one long frame with a fresh
# 0..n-1 index, so they can be drawn on the same axes via hue="data".
df_pred_test_1 = pd.concat([df_test, df_pred_1], ignore_index=True)

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt

In [None]:
# Root-mean-squared error of model 1 on the test set.
# scikit-learn metrics take (y_true, y_pred) in that order; MSE happens to be
# symmetric, but keeping the documented order avoids surprises with metrics
# that are not.
sqrt(mean_squared_error(y_test_1, y_pred_1))

In [None]:
# Observed versus predicted n_cits for model 1, with integer tick positions
# on the x axis.
integer_ticks = MaxNLocator(integer=True)

ax = sns.lineplot(x="num_year_1", y="n_cits", hue="data", data=df_pred_test_1)
ax.xaxis.set_major_locator(integer_ticks)
ax

In [None]:
# Model 2 adds is_open_access as a second feature; the target is still n_cits.
features_2 = ["num_year_1", "is_open_access"]

x_train_2, y_train_2 = df_train[features_2], df_train["n_cits"]
x_test_2, y_test_2 = df_test[features_2], df_test["n_cits"]

In [None]:
# Train model 2 by fitting the same `reg` object on the two-feature training
# set. NOTE(review): `reg` is reused, so from here on it expects two feature
# columns and re-running the model 1 prediction cell will no longer work.
reg.fit(x_train_2, y_train_2)

In [None]:
# Predict n_cits for the test rows with model 2, then build a copy of the
# test frame in which n_cits holds the prediction and the rows are tagged
# "predicted".
y_pred_2 = reg.predict(x_test_2)
df_pred_2 = df_test.assign(data="predicted", n_cits=y_pred_2)

In [None]:
# Stack observed and model 2 predicted rows into one long frame with a
# fresh 0..n-1 index.
df_pred_test_2 = pd.concat([df_test, df_pred_2], ignore_index=True)

In [None]:
# Root-mean-squared error of model 2 on the test set.
# Arguments follow scikit-learn's documented (y_true, y_pred) order; the
# value is the same either way for MSE.
sqrt(mean_squared_error(y_test_2, y_pred_2))

In [None]:
# Root-mean-squared error of model 1 again, for comparison with model 2.
# Arguments follow scikit-learn's documented (y_true, y_pred) order; the
# value is the same either way for MSE.
sqrt(mean_squared_error(y_test_1, y_pred_1))

In [None]:
# Observed versus predicted n_cits for model 2, with integer tick positions
# on the x axis.
integer_ticks = MaxNLocator(integer=True)

ax = sns.lineplot(x="num_year_1", y="n_cits", hue="data", data=df_pred_test_2)
ax.xaxis.set_major_locator(integer_ticks)
ax

In [None]:
# Create the "practical" schema the first time this cell runs.
# The Inspector is SQLAlchemy's public reflection API; calling the dialect's
# get_schema_names hook directly relies on internals.
if "practical" not in db.inspect(engine).get_schema_names():
    # Engine.execute() no longer exists in SQLAlchemy 2.0. engine.begin()
    # opens a connection inside a transaction that is committed when the
    # block exits without an error, so the CREATE SCHEMA is persisted.
    with engine.begin() as connection:
        connection.execute(db.schema.CreateSchema("practical"))

# Store the observed and predicted rows as table practical.intro, replacing
# any table left over from a previous run.
df_pred_test_2.to_sql("intro", con=engine, schema="practical", if_exists='replace')