In [180]:
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression

In [181]:
doctor_training_data = pd.read_excel('./data/doctors_fees_training_data.xlsx') # import xlsx file

In [182]:
doctor_training_data.tail() #check data

Unnamed: 0,Qualification,Experience,Rating,Place,Profile,Miscellaneous_Info,Fees
5956,"MBBS, MS - ENT",19 years experience,98%,"Basavanagudi, Bangalore",ENT Specialist,"98% 45 Feedback Basavanagudi, Bangalore",300
5957,MBBS,33 years experience,,"Nungambakkam, Chennai",General Medicine,,100
5958,MBBS,41 years experience,97%,"Greater Kailash Part 2, Delhi",General Medicine,"97% 11 Feedback Greater Kailash Part 2, Delhi",600
5959,"MBBS, MD - General Medicine",15 years experience,90%,"Vileparle West, Mumbai",General Medicine,General Medical Consultation Viral Fever Treat...,100
5960,"BDS, MDS - Orthodontics",17 years experience,100%,"Pitampura, Delhi",Dentist,Acrylic Partial Denture Crowns and Bridges Fix...,200


In [183]:
#dummy_place = pd.get_dummies(doctor_training_data.Place) #1 hot encoding
#dummy_qualification = pd.get_dummies(doctor_training_data.Qualification) #1 hot encoding
#dummy_profile = pd.get_dummies(doctor_training_data.Profile) #1 hot encoding

doctor_training_data['Place'] = pd.factorize(doctor_training_data['Place'])[0]
doctor_training_data['Qualification'] = pd.factorize(doctor_training_data['Qualification'])[0]
doctor_training_data['Profile'] = pd.factorize(doctor_training_data['Profile'])[0]

In [184]:
#merged = pd.concat([doctor_training_data, dummy_place, dummy_qualification, dummy_profile], axis="columns") #merge new columns
final = doctor_training_data

In [185]:
#final = merged.drop(['Qualification', 'Place', 'Miscellaneous_Info', 'Profile'], axis="columns") #drop old our new columns

In [186]:
#Drop 1 dummy column of each 1hotencoded to avoid the dummy trap
#final = final.drop(['Old City, Hyderabad', 'BSc - Zoology, BAMS', 'ENT Specialist'], axis="columns")

In [187]:
experience = final['Experience'].apply(lambda x: x.split('years experience')[0]) # Parse out the number from the experience

In [188]:
final = final.drop(['Experience'], axis="columns") #drop experience column
final = pd.concat([final, experience], axis="columns") #concat our parsed experience column


In [189]:
final = final.dropna() #Drop NaN rows - these will probably cause the model to throw an error

In [190]:
rating = final['Rating'].apply(lambda x: x.split('%')[0]) #process the rating column to remove the %

In [191]:
final = final.drop('Rating', axis="columns") #drop rating column

In [192]:
final = pd.concat([final, rating], axis="columns") #concat our parsed rating column

In [193]:
final = final.drop('Miscellaneous_Info', axis="columns") #drop this feedback column - hard to process it
final

Unnamed: 0,Qualification,Place,Profile,Fees,Experience,Rating
0,0,0,0,100,24,100
1,1,1,1,350,12,98
4,4,4,1,250,20,100
7,6,7,3,200,10,99
12,10,12,3,200,9,98
...,...,...,...,...,...,...
5953,1419,52,0,100,35,92
5956,15,200,2,300,19,98
5958,16,203,4,600,41,97
5959,7,275,4,100,15,90


In [194]:
model = LinearRegression() #Linear Regression model

In [195]:
x = final.drop('Fees', axis="columns") #drop Fees - this is the column we will be predicting

In [196]:
y = final.Fees #column we will be predicting

In [197]:
model.fit(x, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [198]:
final.shape

(2577, 6)

In [112]:
doctor_test_data =  pd.read_excel('./data/doctors_fees_test_data.xlsx') # import xlsx file

In [201]:
# need to process test data similar to train data so we can predict
#dummy_test_place = pd.get_dummies(doctor_test_data.Place) #1 hot encoding
#dummy_test_qualification = pd.get_dummies(doctor_test_data.Qualification) #1 hot encoding
#dummy_test_profile = pd.get_dummies(doctor_test_data.Profile) #1 hot encoding

doctor_test_data['Place'] = pd.factorize(doctor_test_data['Place'])[0]
doctor_test_data['Qualification'] = pd.factorize(doctor_test_data['Qualification'])[0]
doctor_test_data['Profile'] = pd.factorize(doctor_test_data['Profile'])[0]
final_test = doctor_test_data

In [202]:
#merged_test = pd.concat([doctor_test_data, dummy_test_place, dummy_test_qualification, dummy_test_profile], axis="columns") #merge new columns
#final_test = merged_test.drop(['Qualification', 'Place', 'Miscellaneous_Info', 'Profile'], axis="columns") #drop old our new columns

In [203]:
experience_test = final_test['Experience'].apply(lambda x: x.split('years experience')[0]) # Parse out the number from the experience
final_test = final_test.drop(['Experience'], axis="columns") #drop experience column
final_test = pd.concat([final_test, experience_test], axis="columns") #concat our parsed experience column

In [204]:
final_test = final_test.dropna()

In [206]:
final_test = final_test.drop(['Miscellaneous_Info'], axis="columns")

In [207]:
rating_test = final_test['Rating'].apply(lambda x: x.split('%')[0]) #process the rating column to remove the %
final_test = final_test.drop('Rating', axis="columns") #drop rating column
final_test = pd.concat([final_test, rating_test], axis="columns") #concat our parsed rating column


In [208]:
final_test

Unnamed: 0,Qualification,Place,Profile,Experience,Rating
2,2,2,2,40,70
4,4,4,4,16,100
5,5,5,4,14,90
6,1,6,1,23,94
7,6,7,4,9,94
...,...,...,...,...,...
1972,92,457,2,15,93
1976,30,93,1,11,84
1979,238,46,4,17,100
1980,263,5,4,18,98


In [212]:
# try to predict the final_test Fees
test_x = final_test
test_y = model.predict(test_x)

In [213]:
model.score(test_x, test_y)

1.0