In [2]:
from scipy import stats
from sklearn.linear_model import LinearRegression
import plotly.express as px
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split
import seaborn as sns

In [3]:
# Load the tips data and replace each categorical column with integer codes.
# Series.map leaves NaN for any value that is missing from its mapping.
df = pd.read_csv("tumodata.csv")
category_codes = {
    "sex": {"Female": 0, "Male": 1},
    "smoker": {"No": 0, "Yes": 1},
    "time": {"Lunch": 0, "Dinner": 1},
    "day": {"Thur": 0, "Fri": 1, "Sat": 2, "Sun": 3},
}
for column, codes in category_codes.items():
    df[column] = df[column].map(codes)

# One scatter plot per column against the tip; "tip" itself is skipped.
for label in df.columns:
    if label == "tip":
        continue
    graf = px.scatter(df, x=label, y="tip", title=f"{label} vs tip")
    graf.show()

In [4]:
# Re-read the CSV: the previous cell replaced the categorical columns with
# integer codes, while the colour map below is keyed on the string labels.
df = pd.read_csv("tumodata.csv")

# Narrow frame holding only the three columns this chart uses.
new_df = pd.DataFrame({name: df[name] for name in ("total_bill", "tip", "sex")})

sex_colors = {"Female": "pink", "Male": "blue"}
df1 = px.scatter(
    new_df,
    x="total_bill",
    y="tip",
    color="sex",
    color_discrete_map=sex_colors,
    title="total_bill vs tip",
)
df1.show()

In [22]:
# Tip distributions side by side: split by sex (left) and by day (right).
# Expects df to hold the raw string labels in "sex" and "day"; with the
# integer-encoded frame every filter below would select no rows.
# The previous `fig = go.Figure()` was dropped: it was overwritten immediately.
fig = make_subplots(rows=1, cols=2, subplot_titles=("Tip by sex", "Tip by day"))

# Left panel: one histogram per sex, with a fixed bar colour for each.
for sex_value, bar_color in (("Female", "red"), ("Male", "blue")):
    fig.add_trace(
        go.Histogram(
            x=df.loc[df["sex"] == sex_value, "tip"],
            name=sex_value,
            marker_color=bar_color,
        ),
        row=1,
        col=1,
    )

# Right panel: one histogram per day; the legend shows the full day name.
day_names = (
    ("Thur", "Thursday"),
    ("Fri", "Friday"),
    ("Sat", "Saturday"),
    ("Sun", "Sunday"),
)
for day_value, day_name in day_names:
    fig.add_trace(
        go.Histogram(x=df.loc[df["day"] == day_value, "tip"], name=day_name),
        row=1,
        col=2,
    )

fig.show()

In [None]:
# Remove the categorical columns and "size" before the regression cells.
# drop() returns a new frame that is rebound to df instead of mutating in
# place, and errors="ignore" lets the cell be re-run after the columns are
# already gone (the in-place version raised KeyError on a second run).
df = df.drop(columns=["sex", "smoker", "day", "time", "size"], errors="ignore")
df.head()

Unnamed: 0,total_bill,tip
0,16.99,1.01
1,10.34,1.66
2,21.01,3.5
3,23.68,3.31
4,24.59,3.61


In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    df["total_bill"], df["tip"], test_size=0.2, random_state=42, shuffle=True
)

# Rebuild each split as a frame tagged with its role in a "Data" column,
# so both can be plotted together and coloured by that column.
dict_train = dict(total_bill=x_train.to_numpy(), tip=y_train.to_numpy(), Data="Training")
dict_test = dict(total_bill=x_test.to_numpy(), tip=y_test.to_numpy(), Data="Testing")
df_train = pd.DataFrame(dict_train)
df_test = pd.DataFrame(dict_test)

# Training rows first, then testing rows, with a fresh 0..n-1 index.
df_all = pd.concat([df_train, df_test], ignore_index=True)

In [None]:
# Fit tip ~ total_bill on the training split. scikit-learn expects a 2-D
# feature matrix, hence the reshape to a single column. The target is kept
# 2-D as well, so predict() keeps returning an (n, 1) array.
mymodel = LinearRegression()
mymodel.fit(x_train.values.reshape(-1, 1), y_train.values.reshape(-1, 1))

# Scatter of every row, coloured by split; marker size follows the tip value.
# (The earlier `fig = go.Figure()` was dead code: it was overwritten here.)
fig = px.scatter(
    df_all,
    x="total_bill",
    y="tip",
    color="Data",
    color_discrete_map={"Testing": "red", "Training": "blue"},
    size="tip",
    hover_data=["tip"],
    title="Linear regression",
)

# Predictions for the training bills, drawn as a line. The points are ordered
# by total_bill first so the trace runs left to right instead of doubling
# back over itself.
y_pred = mymodel.predict(x_train.values.reshape(-1, 1))
order = np.argsort(x_train.values)
fig.add_trace(
    go.Scatter(
        x=x_train.values[order],
        y=y_pred.flatten()[order],
        mode="lines",
        name="Regression Line",
        line=dict(color="orange"),
    )
)

# Same least-squares fit via scipy, which also reports r, the p-value and the
# standard error of the slope.
slope, intercept, r, p, Standard_Error = stats.linregress(x_train, y_train)


def myfunc(x):
    """Return slope * x + intercept using the scipy fit computed above.

    Parameters
    ----------
    x : scalar or array-like supporting arithmetic
        Total bill value(s) to evaluate the fitted line at.

    Returns
    -------
    Same kind as ``x``: the value(s) of the fitted line.
    """
    return slope * x + intercept


fig.show()

In [None]:
# Evaluate the fitted line (myfunc) at a total bill of 10.5 and print the result.
bill_amount = 10.5
print(myfunc(bill_amount))

2.0483544805102647


In [None]:
# Evaluate the fitted model on the held-out split.
# The features are reshaped once into the single-column 2-D form that
# scikit-learn expects, then reused for both score() and predict().
x_test_2d = x_test.values.reshape(-1, 1)

myscore = mymodel.score(x_test_2d, y_test)
# The separator was missing before, so the output read "r2 score0.54...".
print(f"r2 score: {myscore}")

mypredict = mymodel.predict(x_test_2d)
print(mypredict)

r2 score0.5449381659234664
[[3.04525623]
 [1.86330727]
 [3.55119456]
 [3.69452593]
 [2.31576375]
 [2.83881627]
 [3.96728338]
 [2.26014262]
 [2.50615915]
 [2.57033737]
 [2.88160176]
 [2.07723468]
 [2.06439904]
 [2.47407003]
 [2.00236009]
 [2.91903905]
 [2.92652651]
 [3.23351235]
 [2.68478854]
 [5.33107064]
 [3.13831465]
 [3.13403611]
 [2.4558862 ]
 [1.94673896]
 [3.16077703]
 [2.17564129]
 [2.02375283]
 [3.62927807]
 [2.68906708]
 [6.07767732]
 [4.99734388]
 [1.75313465]
 [2.83025918]
 [3.09552917]
 [2.74040966]
 [3.50092162]
 [2.21200895]
 [5.53644096]
 [2.33287794]
 [3.35010279]
 [2.04942412]
 [2.47834858]
 [3.48701634]
 [2.03017065]
 [2.03124029]
 [1.25361414]
 [2.05798121]
 [2.92438724]
 [1.73388118]]


In [None]:

# Share of bills per total_bill range, and share of rows per day.
df = pd.read_csv("tumodata.csv")

# Split total_bill into 5 equal-width bins; bin_edges has one more entry than counts.
counts, bin_edges = np.histogram(df.total_bill, bins=5)

# Label each slice with its bin range, rounded to cents so floating-point
# noise in the computed edges does not leak into the legend.
label = [
    f"{low:.2f}-{high:.2f} $"
    for low, high in zip(bin_edges[:-1], bin_edges[1:])
]
new_df = pd.DataFrame({"count": counts, "label": label})
fig = px.pie(new_df, values="count", names="label", title="Total bill", hole=.4)
fig.show()

# The figure gets its own name: assigning it to `df` replaced the DataFrame
# with a Figure, which broke any cell run afterwards that still expected data.
fig_day = px.pie(df, names="day", title="Day", hole=0.4)
fig_day.show()

In [None]:
# Reload the data and replace the categorical columns with integer codes so
# they hold numbers when the correlation matrix is computed.
df = pd.read_csv("tumodata.csv")
df = df.assign(
    sex=df["sex"].map({"Female": 0, "Male": 1}),
    smoker=df["smoker"].map({"No": 0, "Yes": 1}),
    time=df["time"].map({"Lunch": 0, "Dinner": 1}),
    day=df["day"].map({"Thur": 0, "Fri": 1, "Sat": 2, "Sun": 3}),
)

# Pairwise correlations over the numeric columns only.
matrix = df.corr(numeric_only=True)

# Heatmap with each coefficient printed in its cell to two decimals.
heatmap_options = dict(
    x=matrix.columns,
    y=matrix.index,
    color_continuous_scale="RdBu_r",
    text_auto=".2f",
    aspect="auto",
)
fig = px.imshow(matrix, **heatmap_options)

# update_layout returns the figure; it stays the last expression of the cell.
fig.update_layout(
    title="Correlation Matrix",
    xaxis_showgrid=False,
    yaxis_showgrid=False,
    width=1000,
    height=1000,
)