Linear Regression in 3D space

In [1]:
import numpy as np
import matplotlib.pyplot as mpl
from mpl_toolkits.mplot3d import Axes3D
from sklearn import linear_model
# make_regression is exported by sklearn.datasets itself; the private
# sklearn.datasets.samples_generator path was deprecated in scikit-learn 0.22
# and removed in 0.24.
from sklearn.datasets import make_regression

In [2]:
# Build a noisy synthetic regression problem (two features, only one of them
# informative) to use for training and testing.
X, y = make_regression(
    n_samples=100,
    n_features=2,
    n_informative=1,
    random_state=0,
    noise=50,
)

---

# Data manipulation

In [4]:
from sklearn import datasets

In [7]:
# Load the bundled Iris dataset and show the shape of its feature matrix.
iris=datasets.load_iris()
iris.data.shape

(150, 4)

In [10]:
from sklearn import preprocessing

# Small 3x3 example matrix, standardised with preprocessing.scale.
X = np.array([
    [1., -1., 2.],
    [2., 0., 0.],
    [0., 1., -1.],
])
X_scaled = preprocessing.scale(X)
print(X_scaled)

[[ 0.         -1.22474487  1.33630621]
 [ 1.22474487  0.         -0.26726124]
 [-1.22474487  1.22474487 -1.06904497]]


----

# Naive Bayes

In [10]:
import pandas

# Read the toy example table; the first CSV row supplies the column names and
# no column is used as the index.
df = pandas.read_csv(
    "C:/Users/Think/Downloads/000-example.csv",
    index_col=False,
    header=0,
)
print(df)

   Group  F1  F2  F3
0      1   1   1   1
1      0   0   1   0
2      1   0   0   0
3      0   0   0   0
4      0   0   1   0
5      1   0   1   1
6      1   0   0   1


In [11]:
# Class labels taken from the "Group" column.
Group = df["Group"].values
print(Group)

# Predictor columns F1..F3 as a float array.
Features = df[["F1", "F2", "F3"]].values.astype(float)
print(Features)

[1 0 1 0 0 1 1]
[[ 1.  1.  1.]
 [ 0.  1.  0.]
 [ 0.  0.  0.]
 [ 0.  0.  0.]
 [ 0.  1.  0.]
 [ 0.  1.  1.]
 [ 0.  0.  1.]]


In [13]:
from sklearn.naive_bayes import BernoulliNB

# Fit a Bernoulli naive Bayes classifier on the example features.
clf = BernoulliNB()
clf.fit(Features, Group)

# Predicted labels and per-class probabilities for the same rows used to fit.
print(clf.predict(Features))
print(clf.predict_proba(Features))

[1 0 0 0 0 1 1]
[[ 0.1394148   0.8605852 ]
 [ 0.72160356  0.27839644]
 [ 0.63343109  0.36656891]
 [ 0.63343109  0.36656891]
 [ 0.72160356  0.27839644]
 [ 0.24471299  0.75528701]
 [ 0.17763158  0.82236842]]


In [30]:
import numpy as np
from sklearn.naive_bayes import MultinomialNB

# Six random count vectors (values 0..4), each assigned its own class label.
# NOTE(review): no seed is set in this cell, so X differs between runs.
X = np.random.randint(5, size=(6, 3))
print(X)
y = np.array([1, 2, 3, 4, 5, 6])

# Fit a multinomial naive Bayes model and show the class probabilities it
# assigns to the rows it was fitted on.
clf = MultinomialNB()
clf.fit(X, y)
print(clf.predict_proba(X))

[[4 1 1]
 [3 2 3]
 [4 3 2]
 [4 4 0]
 [3 2 2]
 [4 2 2]]
[[ 0.29050524  0.10708635  0.15511107  0.10893387  0.1422825   0.19608096]
 [ 0.13231473  0.24487771  0.17882658  0.02075854  0.22144941  0.20177302]
 [ 0.15736656  0.14297318  0.21268474  0.10099989  0.18963185  0.19634379]
 [ 0.07829721  0.03260392  0.12541699  0.61419306  0.0698894   0.07959942]
 [ 0.16133847  0.18247323  0.19382477  0.06187379  0.20001865  0.20047108]
 [ 0.20557822  0.15218721  0.18522932  0.06450535  0.18350258  0.20899733]]


In [19]:
# Score a single new sample. scikit-learn estimators expect a 2-D
# (n_samples, n_features) array, so the 1-D vector is reshaped into one row
# before it is passed to predict_proba.
x1 = np.array([3, 3, 2])
print(x1)
print(clf.predict_proba(x1.reshape(1, -1)))

[3 3 2]
[[ 0.19394618  0.02948884  0.10985079  0.335238    0.19789623  0.13357996]]




In [31]:
import numpy as np

# Draw a second random batch, update clf incrementally with it, then
# re-predict the rows of X.
XX = np.random.randint(5, size=(6, 3))
yy = np.arange(1, 7)
clf.partial_fit(XX, yy)
print(clf.predict(X))

[1 6 5 4 1 1]


In [32]:
# Show the second random batch.
print(XX)

[[4 2 3]
 [2 0 3]
 [1 4 2]
 [2 1 0]
 [1 2 1]
 [1 3 4]]


In [21]:
# One record per city: a categorical 'city' field and a numeric 'temperature'.
measurements = [
    {'city': name, 'temperature': degrees}
    for name, degrees in (('Dubai', 33.), ('London', 12.), ('San Fransisco', 18.))
]
print(measurements)

[{'city': 'Dubai', 'temperature': 33.0}, {'city': 'London', 'temperature': 12.0}, {'city': 'San Fransisco', 'temperature': 18.0}]


In [22]:
from sklearn.feature_extraction import DictVectorizer

In [27]:
# Turn the list of dicts into a dense numeric array.
vec = DictVectorizer()
vecarray = vec.fit_transform(measurements).toarray()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); use whichever this installation provides.
# The newer method returns an array, so it is converted to a plain list of str
# to keep the printed form the same.
if hasattr(vec, "get_feature_names_out"):
    feature_names = [str(name) for name in vec.get_feature_names_out()]
else:
    feature_names = vec.get_feature_names()
print(feature_names)
print(vecarray)

['city=Dubai', 'city=London', 'city=San Fransisco', 'temperature']
[[  1.   0.   0.  33.]
 [  0.   1.   0.  12.]
 [  0.   0.   1.  18.]]


In [28]:
import pandas

In [37]:
# Read the example table into df, taking column names from the first row.
df=pandas.read_csv("C:/Users/Think/Downloads/000-example.csv",index_col=False,header=0)

----

# Detecting spam in real data

In [26]:
import pandas as pd

# Read the SMS spam data set, decoding the file as ISO-8859-1.
sms = pd.read_csv(
    "C:/Users/Think/Downloads/spam.csv",
    encoding="ISO-8859-1",
)

In [35]:
# Preview the first ten rows.
sms.head(10)

Unnamed: 0,v1,v2,FREE
0,ham,"Go until jurong point, crazy.. Available only ...",-1
1,ham,Ok lar... Joking wif u oni...,-1
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,0
3,ham,U dun say so early hor... U c already then say...,-1
4,ham,"Nah I don't think he goes to usf, he lives aro...",-1
5,spam,FreeMsg Hey there darling it's been 3 week's n...,0
6,ham,Even my brother is not like to speak with me. ...,-1
7,ham,As per your request 'Melle Melle (Oru Minnamin...,-1
8,spam,WINNER!! As a valued network customer you have...,-1
9,spam,Had your mobile 11 months or more? U R entitle...,103


In [32]:
def replace(x):
    """Return 1 when ``x`` is greater than or equal to zero, otherwise 0."""
    return 1 if x >= 0 else 0

In [36]:
def create_index(variable):
    """Add a 0/1 keyword-presence column to the global ``sms`` frame.

    The keyword is upper-cased and searched for in the upper-cased text of the
    "v2" column, so the match is case-insensitive. The resulting column is
    named after the upper-cased keyword and holds 1 where the keyword was
    found and 0 otherwise. Returns None.
    """
    keyword = variable.upper()
    # str.find yields the match position, or -1 when the keyword is absent.
    positions = sms["v2"].str.upper().str.find(keyword)
    sms[keyword] = positions.apply(replace)

In [40]:
# Build the FREE flag column and preview its first ten values.
create_index("FREE")
sms["FREE"].head(10)

0    0
1    0
2    1
3    0
4    0
5    1
6    0
7    0
8    0
9    1
Name: FREE, dtype: int64

In [41]:
def count_digits(x):
    """Return how many characters of ``x`` are digits."""
    digit_chars = [ch for ch in x if ch.isdigit()]
    return len(digit_chars)

In [44]:
# str.isdigit() checks whether the string consists of digits only.
# The sample lives in its own name so the built-in ``str`` type stays usable
# for the rest of the session.
sample = "123456"  # only digits in this string
print(sample.isdigit())

True


## The Bernoulli model

In [45]:
# Keywords whose presence in a message becomes a 0/1 column of ``sms``.
# Each keyword is listed once: create_index overwrites the column it names, so
# repeating a keyword would only redo the same work.
SPAM_KEYWORDS = [
    "FREE", "WINNER", "REMINDER", "CONTRACT", "MOBILE", "URGENT", "IMPORTANT",
    "PRIVATE", "CALL", "SEX", "GUARANTEED", "CONGRATS", "DATING", "CASH",
    "CUSTOMER", "SMS", "ACCOUNT", "HEY", "AWARDED", "WON", "CHOSEN", "WOULD",
    "WIND", "HORNY", "INVITING", "SHOPPING", "ENTITLED", "WANT", "ENTRY",
    "SECRET", "DISCOUNT", "MESSAGE", "XXX", "GIRLS", "CONTACT",
]

for keyword in SPAM_KEYWORDS:
    create_index(keyword)

In [46]:
# Preview the frame with the keyword columns added.
sms.head()

Unnamed: 0,v1,v2,FREE,WINNER,REMINDER,CONTRACT,MOBILE,URGENT,IMPORTANT,PRIVATE,...,SHOPPING,ENTITLED,WANT,ENTRY,SECRET,DISCOUNT,MESSAGE,XXX,GIRLS,CONTACT
0,ham,"Go until jurong point, crazy.. Available only ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,Ok lar... Joking wif u oni...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,ham,U dun say so early hor... U c already then say...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [50]:
# Target column: 1 where v1 is "spam", 0 otherwise.
sms["spam"] = (sms["v1"] == "spam").astype(int)
# Number of digit characters in each message.
sms["DIGITS"] = sms["v2"].apply(count_digits)
# Number of exclamation marks in each message.
sms["EXC_MARK"] = sms["v2"].str.count("!")

In [72]:
# Feature matrix: one keyword flag per column plus the two count features.
# Every column is listed exactly once, because a repeated column would enter
# the model twice. Selecting a list of columns and taking .values already
# gives a 2-D (n_rows, n_columns) array, so no fixed-size reshape is needed
# and the cell works for any number of rows.
feature_columns = [
    "FREE", "WINNER", "REMINDER", "CONTRACT", "MOBILE", "URGENT", "IMPORTANT",
    "PRIVATE", "CALL", "SEX", "GUARANTEED", "CONGRATS", "DATING", "CASH",
    "CUSTOMER", "SMS", "ACCOUNT", "HEY", "AWARDED", "WON", "CHOSEN", "WOULD",
    "WIND", "HORNY", "INVITING", "SHOPPING", "ENTITLED", "WANT", "ENTRY",
    "SECRET", "DISCOUNT", "MESSAGE", "XXX", "GIRLS", "CONTACT",
    "DIGITS", "EXC_MARK",
]
data = sms[feature_columns].values

In [57]:
import numpy as np
from sklearn.naive_bayes import BernoulliNB
# cross_val_score lives in sklearn.model_selection (available since 0.18); the
# old sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import cross_val_score

In [66]:
# Fit Bernoulli naive Bayes on the full feature matrix and store its
# predictions for those same rows.
clf = BernoulliNB()
clf.fit(data, sms["spam"].values)
sms["prediction"] = clf.predict(data)

In [67]:
# 5-fold cross-validation; report the mean score with a spread of two
# standard deviations.
scores = cross_val_score(clf, data, sms["spam"].values, cv=5)
mean_score = scores.mean()
spread = scores.std() * 2
print("Accuracy: %0.2f (+/- %0.2f)" % (mean_score, spread))

Accuracy: 0.95 (+/- 0.01)


## The Multinomial Model

In [78]:
import numpy as np
from sklearn.naive_bayes import MultinomialNB
# cross_val_score lives in sklearn.model_selection (available since 0.18); the
# old sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import cross_val_score

# Fit multinomial naive Bayes on the same feature matrix and store its
# predictions for those rows.
clf = MultinomialNB()
clf.fit(data, sms["spam"].values)
sms["prediction"] = clf.predict(data)

# 15-fold cross-validation; report the mean score with a spread of two
# standard deviations.
scores = cross_val_score(clf, data, sms["spam"].values, cv=15)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.96 (+/- 0.02)


-----

# Support Vector Machine

In [9]:
import pandas
import numpy
from sklearn import preprocessing

# Read the customer table; the first CSV row supplies the column names.
df = pandas.read_csv(
    'C:/Users/Think/Downloads/001-customer.csv',
    index_col=False,
    header=0,
)
Group = df["Group"].values
# Remember to scale the features first!
Features = preprocessing.scale(df[["F1", "F2", "F3", "F4", "F5"]].values)

In [6]:
from sklearn.svm import SVC, LinearSVC, NuSVC

# Linear-kernel SVC with C=0.01, fitted on the scaled features.
clf = SVC(C=0.01, kernel='linear')
clf.fit(Features, Group)

print(clf.decision_function(Features))
print(clf.support_vectors_.shape)  # 738 support vectors in the recorded run
print(clf.coef_)

[[-1.37244856 -0.11410806  0.44669314]
 [ 1.18587131  0.78892706 -0.39103837]
 [ 0.97988755  1.00909795  0.32247696]
 ..., 
 [ 2.13104484 -0.82084641 -1.82626684]
 [-0.53606644 -0.21504912  0.28158403]
 [-1.42253286 -0.91923272 -0.06540544]]
(738, 5)
[[ 0.39479716  0.8782536   0.71849264  0.21987291 -0.06223538]
 [-0.25582012  0.04967206 -0.20830175  1.17099646 -0.66257281]
 [-0.12811594 -0.51116172 -0.44270331  0.79791476 -0.43503393]]




In [10]:
# Number of rows of Features whose predicted label equals the true label.
predictions = clf.predict(Features)
(predictions == Group).sum()

860

In [11]:
# Total number of rows in df.
len(df)

1000

In [15]:
# Linear-kernel NuSVC with nu=0.9, fitted on the same features.
clx = NuSVC(nu=0.9, kernel='linear')
clx.fit(Features, Group)

print(clx.decision_function(Features))
print(clx.support_vectors_.shape)
print(clx.coef_)

[[-0.52069622 -0.34449484  0.30484891]
 [ 0.8236462   0.40376707 -0.22724119]
 [ 0.56978364  0.69487727  0.06814763]
 ..., 
 [ 1.30327658 -0.68078824 -0.98075275]
 [-0.26647691  0.0215637   0.40064077]
 [-0.47517518 -0.62009513  0.2103442 ]]
(967, 5)
[[  1.11344329e-01   4.25926683e-01   3.90729869e-01   1.68087240e-02
   -1.99549280e-02]
 [ -3.04260036e-04  -5.89196772e-03  -6.71059679e-04   7.02649450e-01
   -2.48543019e-01]
 [ -3.55171150e-02  -2.66196690e-01  -2.34817500e-01   2.56544598e-01
   -3.61001335e-02]]




In [17]:
# Number of rows of Features whose NuSVC prediction equals the true label.
predictions = clx.predict(Features)
(predictions == Group).sum()

774

In [27]:
# L1-penalised LinearSVC (solved in the primal, dual=False), fitted on the
# same features; the last line counts the rows it labels correctly.
cls = LinearSVC(penalty="l1", dual=False)
cls.fit(Features, Group)
predictions = cls.predict(Features)
(predictions == Group).sum()

857

857