In [1]:
import pandas as pd
import matplotlib.pyplot as pt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.cluster import KMeans

In [2]:
path = "https://raw.githubusercontent.com/edyoda/Data-Scientist-program/master/Practice%20Problems/data/student-mat.csv"

In [3]:
#Function to load the data & show header
def load_and_return_data(path):
    student_data = pd.read_csv(path, sep=';')
    return student_data

In [4]:
load_and_return_data(path).head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


In [5]:
data = load_and_return_data(path)

In [6]:
#identify columns & dtypes
def identify_cols_dtype(data):
    return data.dtypes

In [7]:
identify_cols_dtype(data)

school        object
sex           object
age            int64
address       object
famsize       object
Pstatus       object
Medu           int64
Fedu           int64
Mjob          object
Fjob          object
reason        object
guardian      object
traveltime     int64
studytime      int64
failures       int64
schoolsup     object
famsup        object
paid          object
activities    object
nursery       object
higher        object
internet      object
romantic      object
famrel         int64
freetime       int64
goout          int64
Dalc           int64
Walc           int64
health         int64
absences       int64
G1             int64
G2             int64
G3             int64
dtype: object

In [8]:
data.school.unique()

array(['GP', 'MS'], dtype=object)

1. school - student's school (binary: "GP" - Gabriel Pereira or "MS" - Mousinho da Silveira)
2. sex - student's sex (binary: "F" - female or "M" - male)
3. age - student's age (numeric: from 15 to 22)
4. address - student's home address type (binary: "U" - urban or "R" - rural)
5. famsize - family size (binary: "LE3" - less or equal to 3 or "GT3" - greater than 3)
6. Pstatus - parent's cohabitation status (binary: "T" - living together or "A" - apart)
7. Medu - mother's education (numeric: 0 - none, 1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education)
8. Fedu - father's education (numeric: 0 - none, 1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education)
9. Mjob - mother's job (nominal: "teacher", "health" care related, civil "services" (e.g. administrative or police), "at_home" or "other")
10. Fjob - father's job (nominal: "teacher", "health" care related, civil "services" (e.g. administrative or police), "at_home" or "other")
11. reason - reason to choose this school (nominal: close to "home", school "reputation", "course" preference or "other")
12. guardian - student's guardian (nominal: "mother", "father" or "other")
13. traveltime - home to school travel time (numeric: 1 - <15 min., 2 - 15 to 30 min., 3 - 30 min. to 1 hour, or 4 - >1 hour)
14. studytime - weekly study time (numeric: 1 - <2 hours, 2 - 2 to 5 hours, 3 - 5 to 10 hours, or 4 - >10 hours)
15. failures - number of past class failures (numeric: n if 1<=n<3, else 4)
16. schoolsup - extra educational support (binary: yes or no)
17. famsup - family educational support (binary: yes or no)
18. paid - extra paid classes within the course subject (Math or Portuguese) (binary: yes or no)
19. activities - extra-curricular activities (binary: yes or no)
20. nursery - attended nursery school (binary: yes or no)
21. higher - wants to take higher education (binary: yes or no)
22. internet - Internet access at home (binary: yes or no)
23. romantic - with a romantic relationship (binary: yes or no)
24. famrel - quality of family relationships (numeric: from 1 - very bad to 5 - excellent)
25. freetime - free time after school (numeric: from 1 - very low to 5 - very high)
26. goout - going out with friends (numeric: from 1 - very low to 5 - very high)
27. Dalc - workday alcohol consumption (numeric: from 1 - very low to 5 - very high)
28. Walc - weekend alcohol consumption (numeric: from 1 - very low to 5 - very high)
29. health - current health status (numeric: from 1 - very bad to 5 - very good)
30. absences - number of school absences (numeric: from 0 to 93)
### these grades are related with the course subject, Math or Portuguese:
1. G1 - first period grade (numeric: from 0 to 20)
2. G2 - second period grade (numeric: from 0 to 20)
3. G3 - final grade (numeric: from 0 to 20, output target)

### Q1. Add average marks section ?

In [9]:
def add_average_marks(data):
    data['average_marks'] = data[['G1','G2','G3']].mean(axis=1)
    return data
data = add_average_marks(data)
data.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3,average_marks
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,3,4,1,1,3,6,5,6,6,5.666667
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,3,3,1,1,3,4,5,5,6,5.333333
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,3,2,2,3,3,10,7,8,10,8.333333
3,GP,F,15,U,GT3,T,4,2,health,services,...,2,2,1,1,5,2,15,14,15,14.666667
4,GP,F,16,U,GT3,T,3,3,other,other,...,3,2,1,2,5,4,6,10,10,8.666667


### Q2. Plot the relationship between romantic & average marks ?

In [10]:
data.scatter(x='romantic',y=)

SyntaxError: invalid syntax (<ipython-input-10-93191adc5855>, line 1)

### Q3. Does the data have any missing values ?

In [37]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 51 columns):
school           395 non-null object
sex              395 non-null object
age              395 non-null int64
address          395 non-null object
famsize          395 non-null object
Pstatus          395 non-null object
Medu             395 non-null int64
Fedu             395 non-null int64
Mjob             395 non-null object
Fjob             395 non-null object
reason           395 non-null object
guardian         395 non-null object
traveltime       395 non-null int64
studytime        395 non-null int64
failures         395 non-null int64
schoolsup        395 non-null object
famsup           395 non-null object
paid             395 non-null object
activities       395 non-null object
nursery          395 non-null object
higher           395 non-null object
internet         395 non-null object
romantic         395 non-null object
famrel           395 non-null int64
freetime      

* There is no missing value in this data

### Q4. Transform all the categorial columns into numberical columns ?

In [11]:
def category_to_numeric(data):
    label_encoding = {}
    for col in data.select_dtypes('O').columns:
        label_encoding[col+'_en'] = LabelEncoder()
        data[col] = label_encoding[col+'_en'].fit_transform(data[col])
    return label_encoding,data
label_encoding,data=category_to_numeric(data)

### Q5. Create a regression for predicting marks based on other columns

In [14]:
target_data = data.average_marks
feature_data = data.drop(columns=['average_marks'])
trainX,testX,trainY,testY = train_test_split(feature_data,target_data)

def create_linear_regression_model():
    lr = LinearRegression()
    return lr

lr = create_linear_regression_model()
lr.fit(trainX,trainY)
lr.predict(testX[:2])

array([12.        ,  9.33333333])

### Q6. Split the model into train & test

In [15]:
def split_train_test(feature_data, target_data, test_size=0.25):
    trainX,testX,trainY,testY = train_test_split(feature_data,target_data,test_size=test_size)
    return trainX,testX,trainY,testY

trainX,testX,trainY,testY = split_train_test(feature_data,target_data)


### Q7. Train the model using training data?

In [16]:
def train_model(lr_model,trainX,trainY):
    lr_model.fit(trainX,trainY)
    return lr_model

In [17]:
lr_model = create_linear_regression_model()
lr_model = train_model(lr_model,trainX,trainY)

### Q8. Test the model using test data ?

In [18]:
def test_model(lr_model,testX,testY):
    y_pred = lr_model.predict(testX)
    print('Mean squared error is:',mean_squared_error(testY,y_pred))
    return y_pred

In [19]:
y_pred = test_model(lr_model,testX,testY)

Mean squared error is: 3.761033810750682e-30


### Q9. Find accuracy of the model using score function ?

In [20]:
def _score(lr_model,testX,testY):
    print('Score:',lr_model.score(testX,testY))

In [21]:
_score(lr_model,testX,testY)

Score: 1.0


### Q10. Create classification model for predicting if internet in available to the student

In [22]:
def classification_model():
    log_r = LogisticRegression()
    return log_r

X = data.drop('internet',axis=1)
Y = data['internet']
log_r = classification_model()

### Q11. Split data for training & testing ?

In [23]:
trainX,testX,trainY,testY = split_train_test(X,Y)

### Q12. Train classification model

In [24]:
def train_classification(log_r,trainX,trainY):
    log_r.fit(trainX,trainY)
    return log_r
log_r = train_classification(log_r, trainX , trainY)




### Q13. Find score, test for few more models (LogisticRegression, DecisionTree, RandomForest )

In [25]:
def logistic_regression(trainX,trainY,testX,testY):
    log_r = LogisticRegression()
    log_r.fit(trainX,trainY)
    print('Logistic Regression Score',log_r.score(testX,testY))

def decision_tree(trainX,trainY,testX,testY):
    decision_tree = DecisionTreeClassifier()
    decision_tree.fit(trainX,trainY)
    print('Decision Tree Score:',decision_tree.score(testX,testY))

def random_forest(trainX,trainY,testX,testY):
    random_forest = RandomForestClassifier(n_estimators=10)
    random_forest.fit(trainX,trainY)
    print('Random Forest Score:',random_forest.score(testX,testY))
    
logistic_regression(trainX,trainY,testX,testY)
decision_tree(trainX,trainY,testX,testY)
random_forest(trainX,trainY,testX,testY)




Logistic Regression Score 0.8686868686868687
Decision Tree Score: 0.696969696969697
Random Forest Score: 0.797979797979798


### Q14. Cluster the students into 3 groups using KMeans

In [26]:
def cluster_group(data):
    kmeans = KMeans(n_clusters=3)
    kmeans.fit(data)
    print('Cluster centers are:',kmeans.cluster_centers_,'\n', end='\n')
    print('Cluster labels are:',kmeans.labels_)
    return kmeans

kmeans = cluster_group(data)

Cluster centers are: [[ 0.13375796  0.40127389 16.72611465  0.7388535   0.24203822  0.92356688
   2.47133758  2.29936306  2.01273885  2.25477707  1.02547771  0.88535032
   1.5477707   2.00636943  0.54140127  0.1910828   0.64968153  0.42038217
   0.50318471  0.79617834  0.91082803  0.78980892  0.33757962  4.00636943
   3.20382166  3.2611465   1.4522293   2.30573248  3.56687898  3.14649682
   8.11464968  7.62420382  6.70063694  7.47983015]
 [ 0.12154696  0.55248619 16.48618785  0.79005525  0.31491713  0.88950276
   2.92265193  2.68508287  2.27624309  2.32044199  1.40331492  0.79005525
   1.38674033  2.14917127  0.08287293  0.0718232   0.55801105  0.50276243
   0.53038674  0.78453039  0.98895028  0.84530387  0.28176796  3.95027624
   3.27624309  2.91712707  1.43646409  2.13259669  3.5801105   3.39226519
  13.51933702 13.6519337  13.80662983 13.65930018]
 [ 0.05263158  0.42105263 17.28070175  0.84210526  0.33333333  0.84210526
   2.96491228  2.61403509  2.26315789  2.22807018  1.42105263  