Replace deprecated pandas as_matrix() with .values

bob7783 · bob7783 · commit b8b97e32891b · 2018-08-04T14:56:08.000-04:00
diff --git a/ab_testing/client.py b/ab_testing/client.py
@@ -16,8 +16,8 @@
 df = pd.read_csv('advertisement_clicks.csv')
 a = df[df['advertisement_id'] == 'A']
 b = df[df['advertisement_id'] == 'B']
-a = a['action'].as_matrix()
-b = b['action'].as_matrix()
+a = a['action'].values
+b = b['action'].values
 
 print("a.mean:", a.mean())
 print("b.mean:", b.mean())
diff --git a/ann_logistic_extra/process.py b/ann_logistic_extra/process.py
@@ -21,7 +21,7 @@ def get_data():
   # df.head()
 
   # easier to work with numpy array
-  data = df.as_matrix()
+  data = df.values
 
   # shuffle it
   np.random.shuffle(data)
diff --git a/cnn_class2/class_activation_maps.py b/cnn_class2/class_activation_maps.py
@@ -6,14 +6,10 @@
 # Note: you may need to update your version of future
 # sudo pip install -U future
 
-from keras.layers import Input, Lambda, Dense, Flatten
 from keras.models import Model
 from keras.applications.resnet50 import ResNet50, preprocess_input, decode_predictions
-# from keras.applications.inception_v3 import InceptionV3, preprocess_input
 from keras.preprocessing import image
-from keras.preprocessing.image import ImageDataGenerator
 
-from sklearn.metrics import confusion_matrix
 import numpy as np
 import scipy as sp
 import matplotlib.pyplot as plt
@@ -24,7 +20,7 @@
 
 
 
-# useful for getting number of files
+# get the image files
 image_files = glob('../large_files/256_ObjectCategories/*/*.jp*g')
 image_files += glob('../large_files/101_ObjectCategories/*/*.jp*g')
 
@@ -72,6 +68,7 @@
   cam = fmaps.dot(w)
 
   # upsample to 224 x 224
+  # 7 x 32 = 224
   cam = sp.ndimage.zoom(cam, (32, 32), order=1)
 
   plt.subplot(1,2,1)
diff --git a/cnn_class2/fashion.py b/cnn_class2/fashion.py
@@ -26,7 +26,7 @@ def y2indicator(Y):
 # get the data
 # https://www.kaggle.com/zalando-research/fashionmnist
 data = pd.read_csv('../large_files/fashionmnist/fashion-mnist_train.csv')
-data = data.as_matrix()
+data = data.values
 np.random.shuffle(data)
 
 X = data[:, 1:].reshape(-1, 28, 28, 1) / 255.0
diff --git a/cnn_class2/fashion2.py b/cnn_class2/fashion2.py
@@ -26,7 +26,7 @@ def y2indicator(Y):
 # get the data
 # https://www.kaggle.com/zalando-research/fashionmnist
 data = pd.read_csv('../large_files/fashionmnist/fashion-mnist_train.csv')
-data = data.as_matrix()
+data = data.values
 np.random.shuffle(data)
 
 X = data[:, 1:].reshape(-1, 28, 28, 1) / 255.0
diff --git a/hmm_class/hmmd.py b/hmm_class/hmmd.py
@@ -10,6 +10,7 @@
 
 import numpy as np
 import matplotlib.pyplot as plt
+from datetime import datetime
 
 
 def random_normalized(d1, d2):
@@ -22,6 +23,7 @@ def __init__(self, M):
         self.M = M # number of hidden states
     
     def fit(self, X, max_iter=30):
+        t0 = datetime.now()
         np.random.seed(123)
         # train the HMM model using the Baum-Welch algorithm
         # a specific instance of the expectation-maximization algorithm
@@ -136,6 +138,8 @@ def fit(self, X, max_iter=30):
         print("B:", self.B)
         print("pi:", self.pi)
 
+        print("Fit duration:", (datetime.now() - t0))
+
         plt.plot(costs)
         plt.show()
 
diff --git a/linear_regression_class/systolic.py b/linear_regression_class/systolic.py
@@ -20,7 +20,7 @@
 import pandas as pd
 
 df = pd.read_excel('mlr02.xls')
-X = df.as_matrix()
+X = df.values
 
 # using age to predict systolic blood pressure
 plt.scatter(X[:,1], X[:,0])
diff --git a/nlp_class/nb.py b/nlp_class/nb.py
@@ -18,7 +18,7 @@
 #       it will work for other types of "counts", like tf-idf, so it should
 #       also work for our "word proportions"
 
-data = pd.read_csv('spambase.data').as_matrix() # use pandas for convenience
+data = pd.read_csv('spambase.data').values # use pandas for convenience
 np.random.shuffle(data) # shuffle each row in-place, but preserve the row
 
 X = data[:,:48]
diff --git a/nlp_class/spam2.py b/nlp_class/spam2.py
@@ -14,6 +14,7 @@
 from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
 from sklearn.model_selection import train_test_split
 from sklearn.naive_bayes import MultinomialNB
+from sklearn.svm import SVC
 from wordcloud import WordCloud
 
 
@@ -32,7 +33,7 @@
 
 # create binary labels
 df['b_labels'] = df['labels'].map({'ham': 0, 'spam': 1})
-Y = df['b_labels'].as_matrix()
+Y = df['b_labels'].values
 
 # try multiple ways of calculating features
 # tfidf = TfidfVectorizer(decode_error='ignore')
@@ -49,7 +50,7 @@
 model.fit(Xtrain, Ytrain)
 print("train score:", model.score(Xtrain, Ytrain))
 print("test score:", model.score(Xtest, Ytest))
-
+exit()
 
 
 # visualize the data
diff --git a/nlp_class3/bilstm_mnist.py b/nlp_class3/bilstm_mnist.py
@@ -28,7 +28,7 @@ def get_mnist(limit=None):
 
   print("Reading in and transforming data...")
   df = pd.read_csv('../large_files/train.csv')
-  data = df.as_matrix()
+  data = df.values
   np.random.shuffle(data)
   X = data[:, 1:].reshape(-1, 28, 28) / 255.0 # data is from 0..255
   Y = data[:, 0]
diff --git a/supervised_class/bayes.py b/supervised_class/bayes.py
@@ -9,6 +9,7 @@
 
 
 import numpy as np
+import matplotlib.pyplot as plt
 from util import get_data
 from datetime import datetime
 from scipy.stats import norm
@@ -60,3 +61,9 @@ def predict(self, X):
     t0 = datetime.now()
     print("Test accuracy:", model.score(Xtest, Ytest))
     print("Time to compute test accuracy:", (datetime.now() - t0), "Test size:", len(Ytest))
+
+    # plot the mean of each class
+    for c, g in iteritems(model.gaussians):
+        plt.imshow(g['mean'].reshape(28, 28))
+        plt.title(c)
+        plt.show()
diff --git a/supervised_class/util.py b/supervised_class/util.py
@@ -12,7 +12,7 @@
 def get_data(limit=None):
     print("Reading in and transforming data...")
     df = pd.read_csv('../large_files/train.csv')
-    data = df.as_matrix()
+    data = df.values
     np.random.shuffle(data)
     X = data[:, 1:] / 255.0 # data is from 0..255
     Y = data[:, 0]
diff --git a/supervised_class2/rf_classification.py b/supervised_class2/rf_classification.py
@@ -55,7 +55,7 @@ def transform(self, df):
     X = np.zeros((N, self.D))
     i = 0
     for col, scaler in iteritems(self.scalers):
-      X[:,i] = scaler.transform(df[col].as_matrix().reshape(-1, 1)).flatten()
+      X[:,i] = scaler.transform(df[col].values.reshape(-1, 1)).flatten()
       i += 1
 
     for col, encoder in iteritems(self.labelEncoders):
@@ -98,7 +98,7 @@ def get_data():
   transformer = DataTransformer()
 
   X = transformer.fit_transform(df)
-  Y = df[0].as_matrix()
+  Y = df[0].values
   return X, Y
 
 
diff --git a/supervised_class2/rf_regression.py b/supervised_class2/rf_regression.py
@@ -44,7 +44,7 @@ def fit(self, df):
     self.scalers = {}
     for col in NUMERICAL_COLS:
       scaler = StandardScaler()
-      scaler.fit(df[col].as_matrix().reshape(-1, 1))
+      scaler.fit(df[col].values.reshape(-1, 1))
       self.scalers[col] = scaler
 
   def transform(self, df):
@@ -53,7 +53,7 @@ def transform(self, df):
     X = np.zeros((N, D))
     i = 0
     for col, scaler in iteritems(self.scalers):
-      X[:,i] = scaler.transform(df[col].as_matrix().reshape(-1, 1)).flatten()
+      X[:,i] = scaler.transform(df[col].values.reshape(-1, 1)).flatten()
       i += 1
     for col in NO_TRANSFORM:
       X[:,i] = df[col]
@@ -96,9 +96,9 @@ def get_data():
   df_test = df.loc[test_idx]
 
   Xtrain = transformer.fit_transform(df_train)
-  Ytrain = np.log(df_train['medv'].as_matrix())
+  Ytrain = np.log(df_train['medv'].values)
   Xtest = transformer.transform(df_test)
-  Ytest = np.log(df_test['medv'].as_matrix())
+  Ytest = np.log(df_test['medv'].values)
   return Xtrain, Ytrain, Xtest, Ytest
 
 
diff --git a/unsupervised_class/kmeans_mnist.py b/unsupervised_class/kmeans_mnist.py
@@ -16,13 +16,13 @@
 import numpy as np
 import pandas as pd
 import matplotlib.pyplot as plt
-from .kmeans import plot_k_means, get_simple_data
+from kmeans import plot_k_means, get_simple_data
 from datetime import datetime
 
 def get_data(limit=None):
     print("Reading in and transforming data...")
     df = pd.read_csv('../large_files/train.csv')
-    data = df.as_matrix()
+    data = df.values
     np.random.shuffle(data)
     X = data[:, 1:] / 255.0 # data is from 0..255
     Y = data[:, 0]
diff --git a/unsupervised_class2/util.py b/unsupervised_class2/util.py
@@ -23,7 +23,7 @@ def getKaggleMNIST():
     # column 0 is labels
     # column 1-785 is data, with values 0 .. 255
     # total size of CSV: (42000, 1, 28, 28)
-    train = pd.read_csv('../large_files/train.csv').as_matrix().astype(np.float32)
+    train = pd.read_csv('../large_files/train.csv').values.astype(np.float32)
     train = shuffle(train)
 
     Xtrain = train[:-1000,1:] / 255