In [None]:
import pandas as pd
import matplotlib as plt
import numpy as np
import os
import statsmodels.api as sm
from scipy import stats
import seaborn as sns
import glob

def merge_cuffdiff(genefile):
    """Left-join every ``gene_exp.diff`` table found under the CWD onto a gene list.

    The current working directory is walked recursively. Each file whose name
    ends with ``gene_exp.diff`` is read as a tab-separated table; its 10th
    column is named ``logFC``, every column is suffixed with
    ``_<name of the directory containing the file>``, and the first column is
    then renamed to ``gene_id`` so it can serve as the join key.

    Parameters
    ----------
    genefile : str
        Path of a CSV file that contains a ``gene_id`` column.

    Returns
    -------
    pandas.DataFrame
        The gene list with the columns of every discovered table appended
        (left join on ``gene_id``).
    """
    search_root = os.getcwd()
    merged = pd.read_csv(genefile)
    for dirpath, _subdirs, names in os.walk(search_root):
        diff_names = [name for name in names if name.endswith('gene_exp.diff')]
        for diff_name in diff_names:
            table = pd.read_table(os.path.join(dirpath, diff_name))
            label = os.path.basename(dirpath)
            # Suffix every column with the directory name so tables from
            # different comparisons do not collide after merging.
            renamed = [str(col) + '_' + label for col in table.columns]
            renamed[9] = "logFC" + '_' + label
            # The first column is the join key and must stay unsuffixed.
            renamed[0] = "gene_id"
            table.columns = renamed
            merged = merged.merge(table, on='gene_id', how='left')
    return merged

def FPKM_restriction(c):
    """Keep only the rows whose expression-value columns are all >= 1.

    A column is treated as an expression-value column when its name contains
    ``'value'`` but neither ``'p_value'`` nor ``'q_value'`` (presumably the
    per-sample FPKM columns of cuffdiff output, given the function name --
    confirm against the input tables).

    Parameters
    ----------
    c : pandas.DataFrame
        Table to filter; it is not modified.

    Returns
    -------
    pandas.DataFrame
        The rows of ``c`` for which every selected column is >= 1. When no
        column qualifies, ``c`` is returned unfiltered (the previous
        implementation raised ``UnboundLocalError`` in that case).
    """
    # Select in one pass instead of appending and then removing entries from
    # the list while iterating over it.
    fpkm_cols = [
        col for col in c.columns
        if 'value' in col and 'p_value' not in col and 'q_value' not in col
    ]
    merged_table = c
    for col in fpkm_cols:
        # Rows holding NaN in the column compare False and are dropped too.
        merged_table = merged_table[merged_table[col] >= 1]
    return merged_table
        
def split_gene_ID(x):
    """Return a copy of ``x`` whose ``gene_id`` values are cut at the first '.'.

    For example ``'ENSG1.5'`` becomes ``'ENSG1'``; identifiers without a dot
    are left unchanged.

    Parameters
    ----------
    x : pandas.DataFrame
        Table with a string ``gene_id`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``x`` itself is left untouched. (The previous
        implementation aliased ``x`` and rewrote the caller's column in place,
        which also triggers SettingWithCopyWarning on filtered slices.)
    """
    new_IDs = x.copy()
    # partition() yields (head, sep, tail); only the head before the first
    # dot is kept.
    new_IDs['gene_id'] = new_IDs['gene_id'].str.partition('.', expand=False).str[0]
    return new_IDs

def merge_metrics(d, pattern="METRICS/*.txt"):
    """Left-join every tab-separated table matching ``pattern`` onto ``d``.

    The first column of each table is renamed to ``gene_id`` and used as the
    join key.

    Parameters
    ----------
    d : pandas.DataFrame
        Table with a ``gene_id`` column; it is not modified.
    pattern : str, optional
        Glob pattern of the tables to merge. Defaults to ``"METRICS/*.txt"``
        relative to the current working directory.

    Returns
    -------
    pandas.DataFrame
        ``d`` with the columns of every matched table appended.
    """
    # glob returns paths in arbitrary order; sorting makes the order of the
    # appended columns reproducible, which matters to code that slices the
    # result by column position.
    for path in sorted(glob.glob(pattern)):
        metrics = pd.read_table(path)
        metrics = metrics.rename(columns={metrics.columns[0]: "gene_id"})
        d = d.merge(metrics, on='gene_id', how='left')
    return d

In [None]:
# Build the analysis table: merge the cuffdiff outputs onto the gene list,
# apply the >= 1 expression filter, strip the '.suffix' from gene ids, then
# attach the METRICS tables.
c = merge_cuffdiff('geneNames.csv')
x = FPKM_restriction(c)
d = split_gene_ID(x)
e = merge_metrics(d)
# Resolve the Desktop folder from the current user's home directory instead
# of hard-coding one user's absolute path, so the cell also runs on other
# accounts and machines.
e.to_csv(os.path.join(os.path.expanduser('~'), 'Desktop', 'mergedmet.csv'))
e

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# Features: every column of `e` from position 56 onward, with missing values
# replaced by 0.
# NOTE(review): the offset 56 presumably marks where the merged METRICS
# columns start -- confirm against the actual column layout of `e`.
metric_cols = e.columns[56:]
X_metrics = e[metric_cols].fillna(0)

# Target: the logFC column of the 'STRESSPEL' comparison.
y_FC = e['logFC_STRESSPEL']

X_train, X_test, y_train, y_test = train_test_split(X_metrics, y_FC,
                                                   random_state = 0)
linreg = LinearRegression().fit(X_train, y_train)

print('linear model intercept: {}'
     .format(linreg.intercept_))
print('linear model coeff:\n{}'
     .format(linreg.coef_))
print('R-squared score (training): {:.3f}'
     .format(linreg.score(X_train, y_train)))
print('R-squared score (test): {:.3f}'
     .format(linreg.score(X_test, y_test)))

# statsmodels' OLS does not add an intercept by itself; add the constant
# column explicitly so the summary describes the same kind of model as the
# scikit-learn fit above (which fits an intercept by default).
est = sm.OLS(y_FC.astype(float), sm.add_constant(X_metrics.astype(float)))
est2 = est.fit()
print(est2.summary())

In [None]:
%matplotlib notebook
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# k-NN classification example on a `fruits` table (height/width features,
# 'fruit_label' target).
# NOTE(review): `fruits` is not defined anywhere in the visible notebook, so
# this cell cannot run from a fresh kernel as shown.
np.set_printoptions(precision=2)



# Display names looked up with `label - 1` at the end of the cell.
target_names_fruits = ['apple', 'mandarin', 'orange', 'lemon']

X_fruits_2d = fruits[['height', 'width']]
y_fruits_2d = fruits['fruit_label']

# NOTE(review): `X_fruits` / `y_fruits` are never assigned in the visible
# notebook (only the `_2d` variants above are) -- this line raises NameError
# unless they exist from hidden kernel state.
X_train, X_test, y_train, y_test = train_test_split(X_fruits, y_fruits, random_state=0)

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# Fit the scaler on the training split only ...
X_train_scaled = scaler.fit_transform(X_train)
# ... and reuse the training-set scaling for the test split.
X_test_scaled = scaler.transform(X_test)

knn = KNeighborsClassifier(n_neighbors = 5)
knn.fit(X_train_scaled, y_train)
print('Accuracy of K-NN classifier on training set: {:.2f}'
     .format(knn.score(X_train_scaled, y_train)))
print('Accuracy of K-NN classifier on test set: {:.2f}'
     .format(knn.score(X_test_scaled, y_test)))

# One hand-written sample with four feature values, scaled like the training data.
# NOTE(review): four values imply `X_fruits` has four feature columns, unlike
# the two-column `X_fruits_2d` above -- confirm which feature set is intended.
example_fruit = [[5.5, 2.2, 10, 0.70]]
example_fruit_scaled = scaler.transform(example_fruit)
# The predicted label is shifted by -1 to index `target_names_fruits`
# (assumes labels are the integers 1..4 -- TODO confirm).
print('Predicted fruit type for ', example_fruit, ' is ', 
          target_names_fruits[knn.predict(example_fruit_scaled)[0]-1])

In [None]:
# Display the feature-column index currently bound to `metric_cols`
# (whichever cell assigned it last in this kernel session).
metric_cols

In [None]:
# Inline version of the expression filter: keep only rows of `c` whose
# 'value' columns (excluding p_value / q_value columns) are all >= 1.
cols = list(c.columns.values)
# Select the columns in one pass rather than appending to `vals` and removing
# entries from it while iterating over it.
vals = [
    col for col in cols
    if 'value' in col and 'p_value' not in col and 'q_value' not in col
]
# Bind `merged_table` up front so it exists even when no column matches.
merged_table = c
for q in vals:
    merged_table = merged_table[merged_table[q] >= 1]
c = merged_table


In [None]:
# Attach every METRICS/*.txt table to `d` exactly once. The merge loop was
# pasted twice here, which joined each file a second time and produced
# duplicated (suffixed) metric columns; the existing helper does a single pass.
d = merge_metrics(d)

In [None]:
# Resolve the Desktop folder from the current user's home directory instead
# of hard-coding one user's absolute path.
e.to_csv(os.path.join(os.path.expanduser('~'), 'Desktop', 'merged.csv'))

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# --- Load and prepare the table -------------------------------------------
sg = pd.read_csv('sgtranscriptome.csv')
sg = sg.rename(index=str, columns={"test_id": "gene_id"})
sg = sg.dropna()
sg = sg.drop(labels='Mito', axis=1)
# Attach every METRICS/*.txt table via the shared helper instead of a
# copy-pasted merge loop.
sg = merge_metrics(sg)
sg = sg.fillna(0)
# NOTE(review): positional slice -- presumably the merged metric columns
# minus the last one; confirm against the actual column layout of `sg`.
metric_cols = sg.columns[13:-1]
X_metrics = sg[metric_cols]
y_FC = sg['Fold change']
y_Loc = sg['Localization']

# --- Classification: predict 'Localization' -------------------------------
X_train, X_test, y_train, y_test = train_test_split(X_metrics, y_Loc,
                                                   random_state = 0)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
# The test split is transformed with the scaling fitted on the training split.
X_test_scaled = scaler.transform(X_test)
# NOTE(review): the scaled arrays are computed but the classifier below is
# fitted and scored on the unscaled features -- confirm which is intended.

from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 25)
knn.fit(X_train, y_train)
print('Accuracy of K-NN classifier on training set: {:.2f}'
     .format(knn.score(X_train, y_train)))
print('Accuracy of K-NN classifier on test set: {:.2f}'
     .format(knn.score(X_test, y_test)))

# Baseline: a classifier that always predicts the most frequent training
# class, scored on the training split and then on the test split.
from sklearn.dummy import DummyClassifier

dummy_majority = DummyClassifier(strategy = 'most_frequent').fit(X_train, y_train)
y_dummy_predictions = dummy_majority.predict(X_test)

dummy = dummy_majority.score(X_train, y_train)
print(dummy)
dummy = dummy_majority.score(X_test, y_test)
print(dummy)

# --- Regression: predict 'Fold change' ------------------------------------
from sklearn.neighbors import KNeighborsRegressor

# One split serves both regressors below; the second, identical
# train_test_split call (same inputs, same random_state) was redundant.
X_train, X_test, y_train, y_test = train_test_split(X_metrics, y_FC, random_state = 0)

knnreg = KNeighborsRegressor(n_neighbors = 5).fit(X_train, y_train)

print(knnreg.predict(X_test))
print('R-squared kNN test score: {:.3f}'
     .format(knnreg.score(X_test, y_test)))

linreg = LinearRegression().fit(X_train, y_train)
print('linear model intercept: {}'
     .format(linreg.intercept_))
print('linear model coeff:\n{}'
     .format(linreg.coef_))
print('R-squared score (training): {:.3f}'
     .format(linreg.score(X_train, y_train)))
print('R-squared score (test): {:.3f}'
     .format(linreg.score(X_test, y_test)))

# statsmodels' OLS does not add an intercept by itself; add the constant
# column explicitly so the summary describes the same kind of model as the
# scikit-learn fit above (which fits an intercept by default).
est = sm.OLS(y_FC, sm.add_constant(X_metrics))
est2 = est.fit()
print(est2.summary())



In [None]:
# Display the `sg` table as currently bound in the kernel.
sg

In [None]:
%matplotlib notebook
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

np.set_printoptions(precision=2)

X_metrics = sg[metric_cols]
# The classification target is the 'Localization' column (the `y_FC` name is
# kept from the earlier cells).
y_FC = sg['Localization']

# NOTE(review): not used below; 'Nether' may be a typo for 'Neither' --
# confirm against the labels actually present in sg['Localization'].
target_names = ['Nether', 'enriched', 'depleted']

X_train, X_test, y_train, y_test = train_test_split(X_metrics, y_FC, random_state=0)

# The classifier is fitted on the unscaled features, so it is scored on the
# same unscaled splits. Previously the scores used X_train_scaled /
# X_test_scaled left over from an earlier cell, i.e. features the model was
# never trained on.
knn = KNeighborsClassifier(n_neighbors = 5)
knn.fit(X_train, y_train)
print('Accuracy of K-NN classifier on training set: {:.2f}'
     .format(knn.score(X_train, y_train)))
print('Accuracy of K-NN classifier on test set: {:.2f}'
     .format(knn.score(X_test, y_test)))

# Example prediction on one held-out row. This replaces a leftover
# four-feature "fruit" sample that did not correspond to this model's
# feature columns or label names.
example_row = X_test.iloc[[0]]
print('Predicted localization for the first test sample: ',
      knn.predict(example_row)[0])

In [None]:
from sklearn.cluster import KMeans
# Fix the seed so the centroid initialisation, and therefore the cluster
# assignment, is reproducible across kernel restarts.
model = KMeans(n_clusters=3, random_state=0)
model.fit(X_metrics)


In [None]:
# Importing Modules
from sklearn import datasets
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt



# Defining Model
# random_state pins the stochastic embedding so the plot is reproducible
# across runs.
model = TSNE(learning_rate=100, random_state=0)

# Fitting Model
transformed = model.fit_transform(X_metrics)

# Plotting 2d t-Sne
x_axis = transformed[:, 0]
y_axis = transformed[:, 1]

plt.scatter(x_axis, y_axis)
plt.xlabel('t-SNE 1')
plt.ylabel('t-SNE 2')
plt.title('t-SNE embedding of X_metrics')
plt.show()

In [None]:
# Load Python Libraries
import swat
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from time import time
%matplotlib inline



In [None]:
# Random ordering of the row positions of X_metrics, drawn from a seeded
# generator so the same rows are selected on every run.
rndperm = np.random.RandomState(0).permutation(X_metrics.shape[0])

In [None]:
# Display the permutation of row positions currently bound to `rndperm`.
rndperm

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt


# Plot up to 30 randomly chosen rows of X_metrics, each drawn as a one-row
# heat strip in a 3 x 10 grid.
plt.gray()
fig = plt.figure( figsize=(16,7) )
# Do not run past the end of the permutation when there are fewer than 30 rows.
for i in range(min(30, len(rndperm))):
    ax = fig.add_subplot(3,10,i+1)
    # rndperm holds row positions, so index with iloc rather than by label;
    # matshow needs 2-D input, so the row is reshaped to (1, n_features).
    row = np.asarray(X_metrics.iloc[rndperm[i]], dtype=float).reshape(1, -1)
    ax.matshow(row, aspect='auto')
plt.show()